ROCm
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
Lines changed: 19 additions & 34 deletions
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
Lines changed: 53 additions & 38 deletions
@@ -103,6 +103,12 @@ def __init__(
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+        self.enable_hybrid_pipeline = (
+            envs.VLLM_NPU_ASYNC_PIPELINE
+            and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
+        )
+        # Set during schedule() when a request is deferred for NPU vision.
+        self.waiting_on_vision_encoding = False
         self.max_num_scheduled_tokens = (
             self.scheduler_config.max_num_scheduled_tokens
             if self.scheduler_config.max_num_scheduled_tokens
@@ -357,11 +363,7 @@ def schedule(self) -> SchedulerOutput:
         # chunked prefills, prefix caching, speculative decoding,
         # and the "jump decoding" optimization in the future.
 
-        # Check if hybrid NPU+GPU pipelining is enabled
-        enable_hybrid_pipeline = (
-            envs.VLLM_NPU_ASYNC_PIPELINE
-            and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
-        )
+        self.waiting_on_vision_encoding = False
 
         scheduled_new_reqs: list[Request] = []
         scheduled_resumed_reqs: list[Request] = []
@@ -580,37 +582,21 @@ def schedule(self) -> SchedulerOutput:
                 request = request_queue.peek_request()
                 request_id = request.request_id
 
-                # HYBRID PIPELINING: Check if vision is ready BEFORE processing
-                if enable_hybrid_pipeline and self.max_num_running_reqs == 1:
+                # HYBRID PIPELINING: defer prefill until NPU vision is ready.
+                if self.enable_hybrid_pipeline and self.max_num_running_reqs == 1:
                     needs_vision = (
                         request.num_computed_tokens == 0 and request.mm_features
                     )
                     if needs_vision:
-                        # Check if vision encoding is complete
-                        from vllm.v1.engine.core import _VISION_PREENCODING_CACHE
-
-                        vision_ready = False
-                        if request_id in _VISION_PREENCODING_CACHE:
-                            cached = _VISION_PREENCODING_CACHE[request_id]
-                            if (
-                                cached == "COMPLETED"
-                                or hasattr(cached, "done")
-                                and cached.done()
-                            ):
-                                vision_ready = True
-
-                        immediate_key = f"immediate_{request_id}"
-                        if (
-                            not vision_ready
-                            and immediate_key in _VISION_PREENCODING_CACHE
-                        ):
-                            cached = _VISION_PREENCODING_CACHE[immediate_key]
-                            if hasattr(cached, "done") and cached.done():
-                                vision_ready = True
+                        from vllm.v1.engine.core import (
+                            _VISION_PREENCODING_CACHE,
+                            is_vision_preencoding_ready,
+                        )
 
-                        if not vision_ready:
-                            # Vision not ready - skip this request
-                            # Vision Scheduler will proactively process waiting requests
+                        if not is_vision_preencoding_ready(
+                            request_id, _VISION_PREENCODING_CACHE
+                        ):
+                            self.waiting_on_vision_encoding = True
                             request_queue.pop_request()
                             step_skipped_waiting.prepend_request(request)
                             continue
@@ -856,13 +842,12 @@ def schedule(self) -> SchedulerOutput:
 
                 self.running.append(request)
 
-                # Log hybrid scheduler decisions
-                if enable_hybrid_pipeline and self.max_num_running_reqs == 1:
+                if self.enable_hybrid_pipeline and self.max_num_running_reqs == 1:
                     is_vision_phase = (
                         request.num_computed_tokens == 0 and request.mm_features
                     )
                     phase_name = "VISION" if is_vision_phase else "LLM"
-                    logger.info(
+                    logger.debug(
                         "[Hybrid Scheduler] Scheduled %s in %s phase (running: %d)",
                         request.request_id,
                         phase_name,
 
@@ -88,6 +88,31 @@
 # Global vision pre-encoding cache (shared between EngineCore and workers)
 _VISION_PREENCODING_CACHE: dict[str, Any] = {}
 
+# Busy-loop backoff while deferred requests wait on NPU vision (seconds).
+# Applied by the engine step loop when no model step ran and the scheduler
+# reports ``waiting_on_vision_encoding``.
+_VISION_POLL_SLEEP_S = 0.02
+# Backoff (seconds) for every other idle step that still has unfinished
+# requests.
+_DEFAULT_BUSY_LOOP_SLEEP_S = 0.001
+
+
+def is_vision_preencoding_ready(
+    request_id: str, cache: dict[str, Any] | None = None
+) -> bool:
+    """Return True when background vision encoding finished for a request.
+
+    Args:
+        request_id: Key to look up in the pre-encoding cache.
+        cache: Mapping to consult. When ``None``, the module-level
+            ``_VISION_PREENCODING_CACHE`` is used.
+
+    Returns:
+        ``True`` if the cached entry is the ``"COMPLETED"`` sentinel or an
+        object whose callable ``done`` attribute reports completion;
+        ``False`` if the entry is missing or not yet finished.
+    """
+    # NOTE(review): the inline scheduler check this helper replaces also
+    # consulted an ``immediate_<request_id>`` key. Confirm nothing still
+    # writes entries under that key before relying on this lookup alone.
+    if cache is None:
+        cache = _VISION_PREENCODING_CACHE
+    cached = cache.get(request_id)
+    if cached is None:
+        return False
+    # Cache values are typed ``Any``. Check the type before comparing so an
+    # object with an overloaded ``==`` is never compared against the
+    # sentinel string.
+    if isinstance(cached, str):
+        return cached == "COMPLETED"
+    done = getattr(cached, "done", None)
+    # Coerce so the annotated ``bool`` return holds whatever ``done()``
+    # yields.
+    return callable(done) and bool(done())
+
+
+def _request_has_vision_mm(request: Any) -> bool:
+    """Tell whether ``request`` has an image or video multimodal feature.
+
+    Returns ``False`` when ``request.mm_features`` is empty or otherwise
+    falsy.
+    """
+    features = request.mm_features
+    if not features:
+        return False
+    for feature in features:
+        if feature.modality in ("image", "video"):
+            return True
+    return False
+
 
 class EngineCore:
     """Inner loop of vLLM's Engine."""
@@ -409,57 +434,47 @@ def _schedule_waiting_vision(self) -> None:
         - Vision scheduler processes ALL waiting requests' vision independently
         - Request 2's vision can process while Request 1's LLM runs
         """
-        # Only enable for NPU async pipelining
-        if not envs.VLLM_NPU_ASYNC_PIPELINE:
+        if not (
+            envs.VLLM_NPU_ASYNC_PIPELINE
+            and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
+        ):
             return
 
-        # Import the pre-encoding cache
-        from vllm.v1.engine.core import _VISION_PREENCODING_CACHE
-
-        # Get waiting requests from scheduler
         try:
             waiting_requests = list(self.scheduler.waiting)  # type: ignore[attr-defined]
-            # Only log when there are actually waiting requests to process
-            if len(waiting_requests) > 0:
-                logger.info(
-                    "[Vision Scheduler] Found %d waiting requests",
-                    len(waiting_requests),
-                )
         except Exception as e:
             logger.exception("[Vision Scheduler] Error accessing waiting queue: %s", e)
             return
 
-        for idx, request in enumerate(waiting_requests):
-            # Only process requests with multimodal features
-            if not request.mm_features:
+        if not waiting_requests:
+            return
+
+        # Skip the scan when every waiting vision request is already submitted.
+        pending_submit = False
+        for request in waiting_requests:
+            if not _request_has_vision_mm(request):
                 continue
+            req_id = request.request_id
+            if req_id not in _VISION_PREENCODING_CACHE:
+                pending_submit = True
+                break
 
-            # Check if this is a vision request
-            has_vision = any(
-                f.modality in ("image", "video") for f in request.mm_features
-            )
-            if not has_vision:
+        if not pending_submit:
+            return
+
+        for request in waiting_requests:
+            if not _request_has_vision_mm(request):
                 continue
 
             req_id = request.request_id
-
-            # Check if already in cache (in progress or completed)
             if req_id in _VISION_PREENCODING_CACHE:
                 continue
 
-            # Compute mm_hash for this request
-            mm_hash = None
-            for mm_feature in request.mm_features:
-                if mm_feature.mm_hash:
-                    mm_hash = mm_feature.mm_hash
-                    break
-
-            if mm_hash is None:
+            has_mm_hash = any(mm_feature.mm_hash for mm_feature in request.mm_features)
+            if not has_mm_hash:
                 continue
 
-            # Submit vision encoding for this waiting request
-            # Pass only serializable data (req_id and mm_features) for RPC
-            logger.info(
+            logger.debug(
                 "[Vision Scheduler] Submitting pre-encoding for request %s", req_id
             )
             self.model_executor.submit_vision_encoding(req_id, request.mm_features)
@@ -476,11 +491,8 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         if not self.scheduler.has_requests():
             return {}, False
 
-        # VISION SCHEDULER: Proactively trigger pre-encoding for waiting requests
-        # Request 2's vision can start while Request 1's LLM runs
-        logger.info("[EngineCore] About to call _schedule_waiting_vision()")
+        # Vision pre-encoding for waiting requests (overlaps with running LLM).
         self._schedule_waiting_vision()
-        logger.info("[EngineCore] Returned from _schedule_waiting_vision()")
 
         scheduler_output = self.scheduler.schedule()
         future = self.model_executor.execute_model(scheduler_output, non_block=True)
@@ -1289,7 +1301,10 @@ def _process_engine_step(self) -> bool:
         # background threads (like NIXL handshake) to make progress.
         # Without this, the tight polling loop can starve background threads.
         if not model_executed and self.scheduler.has_unfinished_requests():
-            time.sleep(0.001)
+            if getattr(self.scheduler, "waiting_on_vision_encoding", False):
+                time.sleep(_VISION_POLL_SLEEP_S)
+            else:
+                time.sleep(_DEFAULT_BUSY_LOOP_SLEEP_S)
 
         return model_executed