feat: Add NPU+GPU async vision pipelining for v1 engine

liangliangchang · cursoragent · liangliangchang · commit 988be0bcdb3e · 2026-06-01T15:42:34.000-06:00
Layer async NPU vision pre-encoding on top of the FlexMLRT backend:
vision scheduler in EngineCore, scheduler deferral when vision is not
ready, and gpu_model_runner pre-encoding thread pool. Gated by
VLLM_NPU_ASYNC_PIPELINE=1 (default off).

Co-authored-by: Cursor <cursoragent@cursor.com>
diff --git a/vllm/envs.py b/vllm/envs.py
@@ -215,6 +215,7 @@
     VLLM_VISION_NPU_BACKEND: str = ""
     VLLM_VISION_NPU_CACHE: str | None = None
     VLLM_VISION_NPU_DEVICE: str | None = None
+    VLLM_NPU_ASYNC_PIPELINE: bool = False
     VLLM_NPU_TIMING: bool = False
     VLLM_MORIIO_QP_PER_TRANSFER: int = 1
     VLLM_MORIIO_POST_BATCH_SIZE: int = -1
@@ -1754,6 +1755,8 @@ def _get_or_set_default() -> str:
     "VLLM_VISION_NPU_CACHE": lambda: os.getenv("VLLM_VISION_NPU_CACHE"),
     # NPU device name (e.g., "stx" for Strix, "phx" for Phoenix)
     "VLLM_VISION_NPU_DEVICE": lambda: os.getenv("VLLM_VISION_NPU_DEVICE"),
+    # Enable async pipelining of NPU vision encoding with GPU LLM inference
+    "VLLM_NPU_ASYNC_PIPELINE": lambda: os.getenv("VLLM_NPU_ASYNC_PIPELINE", "0") == "1",
     # Enable NPU timing debug logs
     "VLLM_NPU_TIMING": lambda: os.getenv("VLLM_NPU_TIMING", "0") == "1",
     # Enable CUDA compatibility mode for datacenter GPUs with older
diff --git a/vllm/model_executor/models/vision.py b/vllm/model_executor/models/vision.py
@@ -628,6 +628,7 @@ def get_npu_vision_backend():
 
     Returns:
         NPUVisionBackend instance if NPU backend is enabled, None otherwise.
+        Returns AsyncFlexMLRTVisionBackend if VLLM_NPU_ASYNC_PIPELINE=1.
 
     Raises:
         ValueError: If backend name is recognized but initialization fails.
@@ -647,8 +648,14 @@ def get_npu_vision_backend():
             )
         device_name = envs.VLLM_VISION_NPU_DEVICE or "stx"
 
-        from vllm.vision_npu.flexmlrt_backend import FlexMLRTVisionBackend
+        # Use async backend if pipelining is enabled
+        if envs.VLLM_NPU_ASYNC_PIPELINE:
+            from vllm.vision_npu.flexmlrt_backend import AsyncFlexMLRTVisionBackend
 
-        return FlexMLRTVisionBackend(model_cache, device_name)
+            return AsyncFlexMLRTVisionBackend(model_cache, device_name)
+        else:
+            from vllm.vision_npu.flexmlrt_backend import FlexMLRTVisionBackend
+
+            return FlexMLRTVisionBackend(model_cache, device_name)
 
     return None
diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py
@@ -103,6 +103,12 @@ def __init__(
 
         # Scheduling constraints.
         self.max_num_running_reqs = self.scheduler_config.max_num_seqs
+        self.enable_hybrid_pipeline = (
+            envs.VLLM_NPU_ASYNC_PIPELINE
+            and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
+        )
+        # Set during schedule() when a request is deferred for NPU vision.
+        self.waiting_on_vision_encoding = False
         self.max_num_scheduled_tokens = (
             self.scheduler_config.max_num_scheduled_tokens
             if self.scheduler_config.max_num_scheduled_tokens
@@ -357,6 +363,8 @@ def schedule(self) -> SchedulerOutput:
         # chunked prefills, prefix caching, speculative decoding,
         # and the "jump decoding" optimization in the future.
 
+        self.waiting_on_vision_encoding = False
+
         scheduled_new_reqs: list[Request] = []
         scheduled_resumed_reqs: list[Request] = []
         scheduled_running_reqs: list[Request] = []
@@ -574,6 +582,25 @@ def schedule(self) -> SchedulerOutput:
                 request = request_queue.peek_request()
                 request_id = request.request_id
 
+                # HYBRID PIPELINING: defer prefill until NPU vision is ready.
+                if self.enable_hybrid_pipeline and self.max_num_running_reqs == 1:
+                    needs_vision = (
+                        request.num_computed_tokens == 0 and request.mm_features
+                    )
+                    if needs_vision:
+                        from vllm.v1.engine.core import (
+                            _VISION_PREENCODING_CACHE,
+                            is_vision_preencoding_ready,
+                        )
+
+                        if not is_vision_preencoding_ready(
+                            request_id, _VISION_PREENCODING_CACHE
+                        ):
+                            self.waiting_on_vision_encoding = True
+                            request_queue.pop_request()
+                            step_skipped_waiting.prepend_request(request)
+                            continue
+
                 # try to promote blocked statuses while traversing skipped queue.
                 if self._is_blocked_waiting_status(
                     request.status
@@ -814,6 +841,19 @@ def schedule(self) -> SchedulerOutput:
                     continue
 
                 self.running.append(request)
+
+                if self.enable_hybrid_pipeline and self.max_num_running_reqs == 1:
+                    is_vision_phase = (
+                        request.num_computed_tokens == 0 and request.mm_features
+                    )
+                    phase_name = "VISION" if is_vision_phase else "LLM"
+                    logger.debug(
+                        "[Hybrid Scheduler] Scheduled %s in %s phase (running: %d)",
+                        request.request_id,
+                        phase_name,
+                        len(self.running),
+                    )
+
                 if self.log_stats:
                     request.record_event(
                         EngineCoreEventType.SCHEDULED, scheduled_timestamp
diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py
@@ -85,6 +85,34 @@
 
 _R = TypeVar("_R")  # Return type for collective_rpc
 
+# Request-id -> vision pre-encoding state; module-global, so shared in-process only.
+_VISION_PREENCODING_CACHE: dict[str, Any] = {}
+
+# Idle-step sleeps (seconds): the longer one applies while waiting on NPU vision.
+_VISION_POLL_SLEEP_S = 0.02
+_DEFAULT_BUSY_LOOP_SLEEP_S = 0.001
+
+
+def is_vision_preencoding_ready(
+    request_id: str, cache: dict[str, Any] | None = None
+) -> bool:
+    """Return True when background vision encoding finished for a request."""
+    if cache is None:
+        cache = _VISION_PREENCODING_CACHE  # default to the module-level cache
+    cached = cache.get(request_id)
+    if cached == "COMPLETED":  # the "COMPLETED" string entry counts as ready
+        return True
+    if cached is None:  # no entry (or a None entry) for this request id
+        return False
+    done = getattr(cached, "done", None)  # presumably a Future - TODO confirm
+    return callable(done) and done()
+
+
+def _request_has_vision_mm(request: Any) -> bool:  # any image/video feature?
+    if not request.mm_features:  # None or empty: nothing multimodal to inspect
+        return False
+    return any(f.modality in ("image", "video") for f in request.mm_features)
+
 
 class EngineCore:
     """Inner loop of vLLM's Engine."""
@@ -398,6 +426,59 @@ def log_iteration_details(self, scheduler_output: SchedulerOutput):
         )
         self._iteration_index += 1
 
+    def _schedule_waiting_vision(self) -> None:
+        """Submit NPU vision pre-encoding for requests in the waiting queue.
+
+        No-op unless VLLM_NPU_ASYNC_PIPELINE is on and the backend is "flexmlrt".
+        Submits each waiting image/video request that has an mm_hash and no entry
+        in _VISION_PREENCODING_CACHE, so its vision work can overlap with the
+        LLM step of the running request (the max-num-seqs=1 pipelining case).
+        """
+        if not (
+            envs.VLLM_NPU_ASYNC_PIPELINE
+            and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
+        ):
+            return
+
+        try:
+            waiting_requests = list(self.scheduler.waiting)  # type: ignore[attr-defined]
+        except Exception as e:
+            logger.exception("[Vision Scheduler] Error accessing waiting queue: %s", e)
+            return
+
+        if not waiting_requests:
+            return
+
+        # Early exit unless some waiting vision request has no cache entry yet.
+        pending_submit = False
+        for request in waiting_requests:
+            if not _request_has_vision_mm(request):
+                continue
+            req_id = request.request_id
+            if req_id not in _VISION_PREENCODING_CACHE:
+                pending_submit = True
+                break
+
+        if not pending_submit:
+            return
+
+        for request in waiting_requests:
+            if not _request_has_vision_mm(request):
+                continue
+
+            req_id = request.request_id
+            if req_id in _VISION_PREENCODING_CACHE:
+                continue
+
+            has_mm_hash = any(mm_feature.mm_hash for mm_feature in request.mm_features)
+            if not has_mm_hash:
+                continue  # NOTE(review): never submitted; may stay deferred forever
+
+            logger.debug(
+                "[Vision Scheduler] Submitting pre-encoding for request %s", req_id
+            )
+            self.model_executor.submit_vision_encoding(req_id, request.mm_features)
+
     def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         """Schedule, execute, and make output.
 
@@ -409,6 +490,10 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
         # or finished and not yet removed from the batch.
         if not self.scheduler.has_requests():
             return {}, False
+
+        # Vision pre-encoding for waiting requests (overlaps with running LLM).
+        self._schedule_waiting_vision()
+
         scheduler_output = self.scheduler.schedule()
         future = self.model_executor.execute_model(scheduler_output, non_block=True)
         grammar_output = self.scheduler.get_grammar_bitmask(scheduler_output)
@@ -467,7 +552,12 @@ def step_with_batch_queue(
         model_executed = False
         deferred_scheduler_output = None
         if self.scheduler.has_requests():
+            # VISION SCHEDULER: Proactively trigger pre-encoding
+            # Request 2's vision can start while Request 1's LLM runs
+            self._schedule_waiting_vision()
+
             scheduler_output = self.scheduler.schedule()
+
             with self.log_error_detail(scheduler_output):
                 exec_future = self.model_executor.execute_model(
                     scheduler_output, non_block=True
@@ -1211,7 +1301,10 @@ def _process_engine_step(self) -> bool:
         # background threads (like NIXL handshake) to make progress.
         # Without this, the tight polling loop can starve background threads.
         if not model_executed and self.scheduler.has_unfinished_requests():
-            time.sleep(0.001)
+            if getattr(self.scheduler, "waiting_on_vision_encoding", False):
+                time.sleep(_VISION_POLL_SLEEP_S)
+            else:
+                time.sleep(_DEFAULT_BUSY_LOOP_SLEEP_S)
 
         return model_executed
 
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
@@ -10,6 +10,7 @@
 import numpy as np
 import torch
 
+from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import (
     STREAM_FINISHED,
@@ -38,6 +39,8 @@
     SchedulerStats,
 )
 
+logger = init_logger(__name__)
+
 # shared empty CPU tensor used as a placeholder pooling output
 EMPTY_CPU_TENSOR = torch.empty(0, device="cpu")
 
@@ -678,6 +681,33 @@ def process_outputs(
                     self._update_stats_from_finished(
                         req_state, finish_reason, iteration_stats
                     )
+
+                    # Debug logging for request timing
+                    if req_state.stats and iteration_stats:
+                        metrics = req_state.stats
+                        e2e_time = (
+                            iteration_stats.iteration_timestamp - metrics.arrival_time
+                        )
+                        queued_time = metrics.scheduled_ts - metrics.queued_ts
+                        prefill_time = metrics.first_token_ts - metrics.scheduled_ts
+                        decode_time = metrics.last_token_ts - metrics.first_token_ts
+                        num_tokens = metrics.num_generation_tokens
+                        tokens_per_sec = (
+                            num_tokens / decode_time if decode_time > 0 else 0
+                        )
+                        logger.debug(
+                            "Request %s: E2E=%.3fs, Queue=%.3fs, "
+                            "Prefill=%.3fs, Decode=%.3fs, "
+                            "Tokens=%d (%.1f tok/s)",
+                            req_state.request_id,
+                            e2e_time,
+                            queued_time,
+                            prefill_time,
+                            decode_time,
+                            num_tokens,
+                            tokens_per_sec,
+                        )
+
                     if self.tracing_enabled:
                         self.do_tracing(engine_core_output, req_state, iteration_stats)
 
diff --git a/vllm/v1/executor/uniproc_executor.py b/vllm/v1/executor/uniproc_executor.py
@@ -132,6 +132,17 @@ def check_health(self) -> None:
         # it's running.
         return
 
+    def submit_vision_encoding(self, req_id, mm_features):
+        """Forward a vision pre-encoding request to the in-process model runner.
+
+        Invoked by EngineCore's vision scheduler for requests still waiting in the
+        queue. Silently does nothing if driver_worker has no model_runner.
+        """
+        # UniProcExecutor holds the worker object itself, so no RPC is involved.
+        if hasattr(self.driver_worker, "model_runner"):
+            self.driver_worker.model_runner.submit_vision_encoding(req_id, mm_features)
+        return None
+
     def shutdown(self) -> None:
         if worker := self.driver_worker:
             worker.shutdown()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
diff --git a/vllm/vision_npu/flexmlrt_backend.py b/vllm/vision_npu/flexmlrt_backend.py