Skip to content

Commit a261ca1

Browse files
authored
Adding check for engine sleeping status at service_discovery (vllm-project#441)
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com> Co-authored-by: Braulio Dumba <Braulio.Dumba@ibm.com>
1 parent bd0178f commit a261ca1

File tree

2 files changed

+41
-2
lines changed

2 files changed

+41
-2
lines changed

src/vllm_router/service_discovery.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,9 @@ class EndpointInfo:
9494
# Model label
9595
model_label: str
9696

97+
# Endpoint's sleep status
98+
sleep: bool
99+
97100
# Pod name
98101
pod_name: Optional[str] = None
99102

@@ -296,6 +299,7 @@ def get_endpoint_info(self) -> List[EndpointInfo]:
296299
url=url,
297300
model_names=[model], # Convert single model to list
298301
Id=self.engines_id[i],
302+
sleep=False,
299303
added_timestamp=self.added_timestamp,
300304
model_label=model_label,
301305
model_info=self._get_model_info(model),
@@ -376,6 +380,33 @@ def _check_pod_ready(container_statuses):
376380
ready_count = sum(1 for status in container_statuses if status.ready)
377381
return ready_count == len(container_statuses)
378382

383+
def _get_engine_sleep_status(self, pod_ip, timeout: float = 5.0) -> Optional[bool]:
    """
    Get the engine sleeping status by querying the engine's
    '/is_sleeping' endpoint.

    The lookup is best-effort: any failure (connection error, timeout,
    non-2xx response, missing or malformed JSON body) is logged as a
    warning and the engine is reported as not sleeping.

    Args:
        pod_ip: the IP address of the pod running the engine
        timeout: maximum number of seconds to wait for the engine to
            respond; without it an unresponsive engine would block the
            caller indefinitely

    Returns:
        the value of the `is_sleeping` field of the engine's JSON
        response, or `False` if the status could not be retrieved
    """
    url = f"http://{pod_ip}:{self.port}/is_sleeping"

    # Attach a bearer token only when an API key is configured in the
    # environment; otherwise send the request without extra headers.
    headers = None
    if api_key := os.getenv("VLLM_API_KEY"):
        logger.info("Using vllm server authentication")
        headers = {"Authorization": f"Bearer {api_key}"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.json()["is_sleeping"]
    except Exception as e:
        # Deliberately broad: network errors, HTTP errors, invalid JSON and
        # a missing key must all degrade to "not sleeping", not crash.
        logger.warning(
            f"Failed to get the sleep status for engine at {url} - sleep status is set to `False`: {e}"
        )
        return False
409+
379410
def _get_model_names(self, pod_ip) -> List[str]:
380411
"""
381412
Get the model names of the serving engine pod by querying the pod's
@@ -505,6 +536,7 @@ def _add_engine(
505536
added_timestamp=int(time.time()),
506537
Id=str(uuid.uuid5(uuid.NAMESPACE_DNS, engine_name)),
507538
model_label=model_label,
539+
sleep=self._get_engine_sleep_status(engine_ip),
508540
pod_name=engine_name,
509541
namespace=self.namespace,
510542
model_info=model_info,

src/vllm_router/services/request_service/request.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -223,15 +223,22 @@ async def route_general_request(
223223
update_content_length(request, request_body)
224224

225225
if not request_endpoint:
226-
endpoints = list(filter(lambda x: requested_model in x.model_names, endpoints))
226+
endpoints = list(
227+
filter(
228+
lambda x: requested_model in x.model_names and x.sleep == False,
229+
endpoints,
230+
)
231+
)
227232
engine_stats = request.app.state.engine_stats_scraper.get_engine_stats()
228233
request_stats = request.app.state.request_stats_monitor.get_request_stats(
229234
time.time()
230235
)
231236
else:
232237
endpoints = list(
233238
filter(
234-
lambda x: requested_model in x.model_names and x.Id == request_endpoint,
239+
lambda x: requested_model in x.model_names
240+
and x.Id == request_endpoint
241+
and x.sleep == False,
235242
endpoints,
236243
)
237244
)

0 commit comments

Comments
 (0)