8585
_R = TypeVar("_R")  # Return type for collective_rpc

# Global vision pre-encoding cache (shared between EngineCore and workers),
# keyed by request id. `is_vision_preencoding_ready` treats an entry as
# finished when it is the string "COMPLETED" or an object whose `done()`
# method returns a true value; a missing key means nothing is ready yet.
_VISION_PREENCODING_CACHE: dict[str, Any] = {}

# Sleep intervals, in seconds, for the engine busy loop when a step did not
# execute the model but unfinished requests remain:
#   * _DEFAULT_BUSY_LOOP_SLEEP_S - the regular short yield that lets
#     background threads make progress.
#   * _VISION_POLL_SLEEP_S - longer backoff used while the scheduler reports
#     `waiting_on_vision_encoding`, i.e. deferred requests are waiting on NPU
#     vision encoding.
_DEFAULT_BUSY_LOOP_SLEEP_S = 0.001
_VISION_POLL_SLEEP_S = 0.02
94+
95+
96+ def is_vision_preencoding_ready (
97+ request_id : str , cache : dict [str , Any ] | None = None
98+ ) -> bool :
99+ """Return True when background vision encoding finished for a request."""
100+ if cache is None :
101+ cache = _VISION_PREENCODING_CACHE
102+ cached = cache .get (request_id )
103+ if cached == "COMPLETED" :
104+ return True
105+ if cached is None :
106+ return False
107+ done = getattr (cached , "done" , None )
108+ return callable (done ) and done ()
109+
110+
def _request_has_vision_mm(request: Any) -> bool:
    """Tell whether a request carries at least one image or video input.

    Args:
        request: Object exposing ``mm_features``, an iterable (possibly
            empty or ``None``) of items that each have a ``modality``.

    Returns:
        ``True`` if any feature's modality is ``"image"`` or ``"video"``;
        ``False`` when there are no features or none of them is visual.
    """
    features = request.mm_features
    if features:
        for feature in features:
            if feature.modality in ("image", "video"):
                return True
    return False
115+
88116
89117class EngineCore :
90118 """Inner loop of vLLM's Engine."""
@@ -398,6 +426,59 @@ def log_iteration_details(self, scheduler_output: SchedulerOutput):
398426 )
399427 self ._iteration_index += 1
400428
def _schedule_waiting_vision(self) -> None:
    """Vision Scheduler: Proactively trigger pre-encoding for waiting requests.

    This is the key to enabling pipelining with max-num-seqs=1:
    - Core scheduler only schedules 1 LLM at a time (max-num-seqs=1)
    - Vision scheduler processes ALL waiting requests' vision independently
    - Request 2's vision can process while Request 1's LLM runs

    The method is a no-op unless ``envs.VLLM_NPU_ASYNC_PIPELINE`` is truthy
    and ``envs.VLLM_VISION_NPU_BACKEND`` equals ``"flexmlrt"``
    (case-insensitive). Otherwise, for every request in the scheduler's
    waiting queue that has image/video features, is not yet present in
    ``_VISION_PREENCODING_CACHE`` and has at least one feature with a truthy
    ``mm_hash``, it calls ``self.model_executor.submit_vision_encoding``.

    NOTE(review): this method never inserts into
    ``_VISION_PREENCODING_CACHE`` itself. Presumably
    ``submit_vision_encoding`` records the request there; confirm, since
    otherwise the same request would be re-submitted on every engine step.
    """
    if not (
        envs.VLLM_NPU_ASYNC_PIPELINE
        and envs.VLLM_VISION_NPU_BACKEND.lower() == "flexmlrt"
    ):
        return

    # Snapshot the queue so iteration is independent of the scheduler's own
    # container. Best-effort: a scheduler without a usable ``waiting``
    # attribute must not break the engine step.
    try:
        waiting_requests = list(self.scheduler.waiting)  # type: ignore[attr-defined]
    except Exception as e:
        logger.exception("[Vision Scheduler] Error accessing waiting queue: %s", e)
        return

    # Single pass: each request is filtered once. An empty queue, or one
    # where every vision request is already cached, submits nothing.
    for request in waiting_requests:
        if not _request_has_vision_mm(request):
            continue

        req_id = request.request_id
        if req_id in _VISION_PREENCODING_CACHE:
            # Already submitted (or completed) earlier.
            continue

        # Requests whose features carry no mm_hash are not pre-encoded.
        if not any(mm_feature.mm_hash for mm_feature in request.mm_features):
            continue

        logger.debug(
            "[Vision Scheduler] Submitting pre-encoding for request %s", req_id
        )
        self.model_executor.submit_vision_encoding(req_id, request.mm_features)
481+
401482 def step (self ) -> tuple [dict [int , EngineCoreOutputs ], bool ]:
402483 """Schedule, execute, and make output.
403484
@@ -409,6 +490,10 @@ def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]:
409490 # or finished and not yet removed from the batch.
410491 if not self .scheduler .has_requests ():
411492 return {}, False
493+
494+ # Vision pre-encoding for waiting requests (overlaps with running LLM).
495+ self ._schedule_waiting_vision ()
496+
412497 scheduler_output = self .scheduler .schedule ()
413498 future = self .model_executor .execute_model (scheduler_output , non_block = True )
414499 grammar_output = self .scheduler .get_grammar_bitmask (scheduler_output )
@@ -467,7 +552,12 @@ def step_with_batch_queue(
467552 model_executed = False
468553 deferred_scheduler_output = None
469554 if self .scheduler .has_requests ():
555+ # VISION SCHEDULER: Proactively trigger pre-encoding
556+ # Request 2's vision can start while Request 1's LLM runs
557+ self ._schedule_waiting_vision ()
558+
470559 scheduler_output = self .scheduler .schedule ()
560+
471561 with self .log_error_detail (scheduler_output ):
472562 exec_future = self .model_executor .execute_model (
473563 scheduler_output , non_block = True
@@ -1211,7 +1301,10 @@ def _process_engine_step(self) -> bool:
12111301 # background threads (like NIXL handshake) to make progress.
12121302 # Without this, the tight polling loop can starve background threads.
12131303 if not model_executed and self .scheduler .has_unfinished_requests ():
1214- time .sleep (0.001 )
1304+ if getattr (self .scheduler , "waiting_on_vision_encoding" , False ):
1305+ time .sleep (_VISION_POLL_SLEEP_S )
1306+ else :
1307+ time .sleep (_DEFAULT_BUSY_LOOP_SLEEP_S )
12151308
12161309 return model_executed
12171310
0 commit comments