Adding check for engine sleeping status at service_discovery (vllm-project#441)

dumb0002 · web-flow · commit a261ca1785a6 · 2025-06-10T15:54:25.000-07:00
Signed-off-by: Braulio Dumba <Braulio.Dumba@ibm.com>
Co-authored-by: Braulio Dumba <Braulio.Dumba@ibm.com>
diff --git a/src/vllm_router/service_discovery.py b/src/vllm_router/service_discovery.py
@@ -94,6 +94,9 @@ class EndpointInfo:
     # Model label
     model_label: str
 
+    # Endpoint's sleep status
+    sleep: bool
+
     # Pod name
     pod_name: Optional[str] = None
 
@@ -296,6 +299,7 @@ def get_endpoint_info(self) -> List[EndpointInfo]:
                 url=url,
                 model_names=[model],  # Convert single model to list
                 Id=self.engines_id[i],
+                sleep=False,
                 added_timestamp=self.added_timestamp,
                 model_label=model_label,
                 model_info=self._get_model_info(model),
@@ -376,6 +380,33 @@ def _check_pod_ready(container_statuses):
         ready_count = sum(1 for status in container_statuses if status.ready)
         return ready_count == len(container_statuses)
 
+    def _get_engine_sleep_status(self, pod_ip) -> bool:
+        """
+        Query the engine's '/is_sleeping' endpoint for its sleep status.
+        Best-effort: any failure is logged and reported as not sleeping.
+
+        Args:
+            pod_ip: the IP address of the pod running the engine
+
+        Returns:
+            True if the engine reports it is sleeping, otherwise False
+        """
+        url = f"http://{pod_ip}:{self.port}/is_sleeping"
+        headers = None
+        if VLLM_API_KEY := os.getenv("VLLM_API_KEY"):
+            logger.info("Using vllm server authentication")
+            headers = {"Authorization": f"Bearer {VLLM_API_KEY}"}
+        try:
+            # Bounded wait so an unresponsive pod cannot block the caller forever.
+            response = requests.get(url, headers=headers, timeout=5)
+            response.raise_for_status()
+            return bool(response.json()["is_sleeping"])
+        except Exception as e:
+            logger.warning(
+                f"Failed to get the sleep status for engine at {url} - sleep status is set to `False`: {e}"
+            )
+        return False
+
     def _get_model_names(self, pod_ip) -> List[str]:
         """
         Get the model names of the serving engine pod by querying the pod's
@@ -505,6 +536,7 @@ def _add_engine(
                 added_timestamp=int(time.time()),
                 Id=str(uuid.uuid5(uuid.NAMESPACE_DNS, engine_name)),
                 model_label=model_label,
+                sleep=self._get_engine_sleep_status(engine_ip),
                 pod_name=engine_name,
                 namespace=self.namespace,
                 model_info=model_info,
diff --git a/src/vllm_router/services/request_service/request.py b/src/vllm_router/services/request_service/request.py
@@ -223,15 +223,22 @@ async def route_general_request(
         update_content_length(request, request_body)
 
     if not request_endpoint:
-        endpoints = list(filter(lambda x: requested_model in x.model_names, endpoints))
+        endpoints = list(
+            filter(
+                lambda x: requested_model in x.model_names and x.sleep == False,
+                endpoints,
+            )
+        )
         engine_stats = request.app.state.engine_stats_scraper.get_engine_stats()
         request_stats = request.app.state.request_stats_monitor.get_request_stats(
             time.time()
         )
     else:
         endpoints = list(
             filter(
-                lambda x: requested_model in x.model_names and x.Id == request_endpoint,
+                lambda x: requested_model in x.model_names
+                and x.Id == request_endpoint
+                and x.sleep == False,
                 endpoints,
             )
         )