@@ -581,18 +581,40 @@ def __init__(
581581 ) -> None :
582582 super ().__init__ ()
583583
584+ # Store minimal config needed for both NPU and PyTorch paths
585+ self .out_hidden_size = vision_config .out_hidden_size
586+ self .spatial_merge_size = vision_config .spatial_merge_size
587+ self .spatial_merge_unit = self .spatial_merge_size ** 2
588+
589+ # Check NPU backend before creating PyTorch modules
590+ from vllm .model_executor .models .vision import (
591+ get_npu_vision_backend ,
592+ use_npu_vision_backend ,
593+ )
594+
595+ if use_npu_vision_backend ():
596+ try :
597+ self .npu_backend = get_npu_vision_backend ()
598+ logger .info ("[Qwen2.5VL] Using NPU vision backend" )
599+ return
600+ except Exception as e :
601+ logger .error ("[Qwen2.5VL] NPU backend init failed: %s" , e )
602+ raise RuntimeError (
603+ f"NPU vision backend initialization failed: { e } . "
604+ "Set VLLM_VISION_NPU_BACKEND='' to use PyTorch backend."
605+ ) from e
606+
607+ self .npu_backend = None
584608 patch_size = vision_config .patch_size
585609 temporal_patch_size = vision_config .temporal_patch_size
586610 in_channels = vision_config .in_channels
587611 depth = vision_config .depth
588612 self .hidden_size = vision_config .hidden_size
589613 self .num_heads = vision_config .num_heads
590- self .out_hidden_size = vision_config .out_hidden_size
591614
592615 # args for get_window_index_thw
593616 self .window_size = vision_config .window_size
594617 self .patch_size = vision_config .patch_size
595- self .spatial_merge_size = vision_config .spatial_merge_size
596618 self .fullatt_block_indexes = vision_config .fullatt_block_indexes
597619 self .spatial_merge_unit = self .spatial_merge_size ** 2
598620 self .patch_embed = Qwen2_5_VisionPatchEmbed (
@@ -653,11 +675,22 @@ def __init__(
653675
654676 @property
655677 def dtype (self ) -> torch .dtype :
656- return self .patch_embed .proj .weight .dtype
678+ if hasattr (self , "npu_backend" ) and self .npu_backend is not None :
679+ return torch .bfloat16
680+ if hasattr (self , "patch_embed" ):
681+ return self .patch_embed .proj .weight .dtype
682+ # Safe fallback if neither exists
683+ return torch .bfloat16
657684
658685 @property
659686 def device (self ) -> torch .device :
660- return self .patch_embed .proj .weight .device
687+ if hasattr (self , "npu_backend" ) and self .npu_backend is not None :
688+ # NPU outputs are on CPU, transfer to GPU happens in forward
689+ return torch .device ("cpu" )
690+ if hasattr (self , "patch_embed" ):
691+ return self .patch_embed .proj .weight .device
692+ # Safe fallback
693+ return torch .device ("cpu" )
661694
662695 def rotary_pos_emb_thw (self , t , h , w ):
663696 hpos_ids = torch .arange (h ).unsqueeze (1 ).expand (- 1 , w )
@@ -787,6 +820,94 @@ def forward(
787820 x : torch .Tensor ,
788821 grid_thw : list [list [int ]],
789822 ) -> torch .Tensor :
823+ # Dispatch to NPU or PyTorch backend
824+ if hasattr (self , "npu_backend" ) and self .npu_backend is not None :
825+ return self ._forward_npu (x , grid_thw )
826+ else :
827+ return self ._forward_pytorch (x , grid_thw )
828+
829+ def _forward_npu (
830+ self , pixel_values : torch .Tensor , grid_thw : list [list [int ]]
831+ ) -> torch .Tensor :
832+ """Forward pass using NPU backend."""
833+ import logging
834+ import time
835+
836+ import numpy as np
837+
838+ logger = logging .getLogger (__name__ )
839+
840+ # Convert PyTorch → NumPy (handle bfloat16 by converting to float32 first)
841+ if pixel_values .dtype == torch .bfloat16 :
842+ pixel_values_np = pixel_values .cpu ().float ().numpy ()
843+ else :
844+ pixel_values_np = pixel_values .cpu ().numpy ().astype (np .float32 )
845+ grid_thw_np = np .array (grid_thw , dtype = np .int64 )
846+
847+ # Run NPU inference
848+ embeddings_np = self .npu_backend .forward (pixel_values_np , grid_thw_np )
849+
850+ # Convert back to PyTorch and transfer to GPU for LLM
851+ import vllm .envs as envs
852+
853+ if envs .VLLM_NPU_TIMING :
854+ gpu_transfer_start = time .monotonic ()
855+ embeddings = torch .from_numpy (embeddings_np ).to (
856+ device = "cuda" , dtype = torch .bfloat16
857+ )
858+ gpu_transfer_ms = (time .monotonic () - gpu_transfer_start ) * 1000
859+ logger .debug (
860+ "[NPU Timing] CPU→GPU transfer: %.2fms (%.2f MB)" ,
861+ gpu_transfer_ms ,
862+ embeddings_np .nbytes / 1024 ** 2 ,
863+ )
864+ logger .debug ("[Vision→LLM] Vision embeddings shape: %s" , embeddings .shape )
865+ else :
866+ embeddings = torch .from_numpy (embeddings_np ).to (
867+ device = "cuda" , dtype = torch .bfloat16
868+ )
869+
870+ # NPU model outputs compressed tokens but vLLM expects uncompressed
871+ # count. We need to pad/repeat to match expected count based on grid_thw
872+ actual_tokens = embeddings .shape [0 ]
873+ merge_size = self .spatial_merge_size
874+ expected_tokens_per_image = [
875+ (t * h * w ) // (merge_size * merge_size ) for t , h , w in grid_thw
876+ ]
877+ total_expected = sum (expected_tokens_per_image )
878+
879+ if actual_tokens != total_expected :
880+ logger .warning (
881+ "[NPU] Token count mismatch: NPU output %s tokens, "
882+ "but vLLM expects %s based on grid_thw. "
883+ "Repeating tokens to match expected count." ,
884+ actual_tokens ,
885+ total_expected ,
886+ )
887+ repeat_factor = total_expected / actual_tokens
888+ if repeat_factor == int (repeat_factor ):
889+ embeddings = embeddings .repeat_interleave (int (repeat_factor ), dim = 0 )
890+ else :
891+ embeddings = embeddings .unsqueeze (0 ).unsqueeze (0 )
892+ embeddings = torch .nn .functional .interpolate (
893+ embeddings ,
894+ size = (total_expected , embeddings .shape [- 1 ]),
895+ mode = "nearest" ,
896+ )
897+ embeddings = embeddings .squeeze (0 ).squeeze (0 )
898+
899+ logger .debug (
900+ "[NPU] Padded from %s to %s tokens" , actual_tokens , embeddings .shape [0 ]
901+ )
902+
903+ return embeddings
904+
905+ def _forward_pytorch (
906+ self ,
907+ x : torch .Tensor ,
908+ grid_thw : list [list [int ]],
909+ ) -> torch .Tensor :
910+ """Original PyTorch forward pass."""
790911 # patchify
791912 seq_len , _ = x .size ()
792913 rotary_pos_emb_cos = []
@@ -889,6 +1010,12 @@ def forward(
8891010 return hidden_states
8901011
8911012 def load_weights (self , weights : Iterable [tuple [str , torch .Tensor ]]) -> set [str ]:
1013+ if self .npu_backend is not None :
1014+ logger .info (
1015+ "[Qwen2.5VL Vision] Skipping weight loading (using NPU backend)"
1016+ )
1017+ return set ()
1018+
8921019 stacked_params_mapping = [
8931020 # (param_name, shard_name, shard_id)
8941021 ("attn.qkv." , "attn.q." , "q" ),
@@ -1231,8 +1358,25 @@ def _process_image_input(
12311358 image_embeds = self .visual (pixel_values , grid_thw = grid_thw_list )
12321359
12331360 # Split concatenated embeddings for each image item.
1234- merge_size = self .visual .spatial_merge_size
1235- sizes = (grid_thw .prod (- 1 ) // merge_size // merge_size ).tolist ()
1361+ # When using NPU backend, merge is already done in NPU, so use actual
1362+ # output size
1363+ if hasattr (self .visual , "npu_backend" ) and self .visual .npu_backend is not None :
1364+ # NPU backend already did spatial merging - use actual output sizes
1365+ # For single image: sizes = [actual_num_tokens]
1366+ # For batched images: split based on actual output
1367+ num_images = len (grid_thw_list )
1368+ if num_images == 1 :
1369+ # Single image - return the whole embedding
1370+ sizes = [image_embeds .shape [0 ]]
1371+ else :
1372+ # Multiple images - need to split based on actual grid sizes
1373+ # Each image: (T*H*W) // (merge_size^2) tokens after NPU
1374+ merge_size = self .visual .spatial_merge_size
1375+ sizes = (grid_thw .prod (- 1 ) // merge_size // merge_size ).tolist ()
1376+ else :
1377+ # PyTorch backend - calculate expected size
1378+ merge_size = self .visual .spatial_merge_size
1379+ sizes = (grid_thw .prod (- 1 ) // merge_size // merge_size ).tolist ()
12361380 return image_embeds .split (sizes )
12371381
12381382 def _postprocess_image_embeds_evs (
@@ -1495,6 +1639,22 @@ def compute_logits(
14951639 return self .language_model .compute_logits (hidden_states )
14961640
14971641 def load_weights (self , weights : Iterable [tuple [str , torch .Tensor ]]) -> set [str ]:
1642+ if hasattr (self .visual , "npu_backend" ) and self .visual .npu_backend is not None :
1643+ logger .info (
1644+ "[Qwen2.5VL Model] Filtering out visual weights (using NPU backend)"
1645+ )
1646+ filtered_weights = []
1647+ visual_weight_count = 0
1648+ for name , weight in weights :
1649+ if name .startswith ("visual." ):
1650+ visual_weight_count += 1
1651+ continue
1652+ filtered_weights .append ((name , weight ))
1653+ logger .info (
1654+ "[Qwen2.5VL Model] Skipped %s visual weights" , visual_weight_count
1655+ )
1656+ weights = filtered_weights
1657+
14981658 loader = AutoWeightsLoader (self )
14991659 return loader .load_weights (weights , mapper = self .hf_to_vllm_mapper )
15001660
@@ -1526,3 +1686,25 @@ def get_num_mm_connector_tokens(
15261686 vision_config = hf_config .vision_config
15271687 merge_size = vision_config .spatial_merge_size
15281688 return num_vision_tokens // merge_size ** 2
1689+
1690+ def embed_input_ids (
1691+ self ,
1692+ input_ids : torch .Tensor ,
1693+ multimodal_embeddings : tuple [torch .Tensor , ...] | None = None ,
1694+ is_multimodal : torch .Tensor | None = None ,
1695+ ) -> torch .Tensor :
1696+ """Embed token ids and merge multimodal embeddings (V1 MM path)."""
1697+ inputs_embeds = self .language_model .model .embed_input_ids (input_ids )
1698+ if (
1699+ multimodal_embeddings is not None
1700+ and is_multimodal is not None
1701+ and len (multimodal_embeddings ) > 0
1702+ ):
1703+ from vllm .model_executor .models .utils import _merge_multimodal_embeddings
1704+
1705+ inputs_embeds = _merge_multimodal_embeddings (
1706+ inputs_embeds ,
1707+ multimodal_embeddings ,
1708+ is_multimodal ,
1709+ )
1710+ return inputs_embeds
0 commit comments