Merged
8 changes: 5 additions & 3 deletions src/cpp/src/continuous_batching/model_runner.hpp
@@ -514,9 +514,11 @@ class ModelRunner {
         if (hidden_state_input && hidden_state_input.get_size() > 0) {
             m_request.set_tensor("hidden_states", hidden_state_input);
         }
-        if (position_ids.get_shape().size() == 3) {
-            // flatten positions ids for 3D position ids case
-            position_ids.set_shape({ov::shape_size(position_ids.get_shape())});
+        if (position_ids.get_shape().size() == 3 && position_ids.get_shape()[0] == 3 &&
+            position_ids.get_shape()[1] == 1) {
+            // M-RoPE: squeeze pseudo-batch dim [3, 1, total_token_num] -> [3, total_token_num]
+            const auto& position_ids_shape = position_ids.get_shape();
Comment on lines +517 to +520
Copilot AI Feb 26, 2026
The hardcoded check for shape[0] == 3 and shape[1] == 1 is specific to M-RoPE (Multi-dimensional Rotary Position Embedding) used by Qwen2-VL models. This is fragile because:

  1. It assumes M-RoPE always uses exactly 3 dimensions and a batch size of 1
  2. Other models with 3D position_ids but different shapes would bypass this logic
  3. There's no documentation explaining why these specific values are checked

Consider:

  • Adding a comment explaining that these values are M-RoPE specific constants
  • Adding a model capability flag or configuration to indicate M-RoPE support rather than relying on shape inference
  • Validating that these constants match the actual M-RoPE implementation in Qwen2VL (currently hardcoded as shape [3, 1, N] in qwen2vl/classes.cpp:1208)
Suggested change
if (position_ids.get_shape().size() == 3 && position_ids.get_shape()[0] == 3 &&
position_ids.get_shape()[1] == 1) {
// M-RoPE: squeeze pseudo-batch dim [3, 1, total_token_num] -> [3, total_token_num]
const auto& position_ids_shape = position_ids.get_shape();
// Qwen2-VL M-RoPE compatibility:
// Qwen2-VL uses Multi-dimensional Rotary Position Embedding (M-RoPE), where position_ids
// are produced with a fixed 3D layout [3, 1, total_token_num]. The leading dimension (3)
// corresponds to M-RoPE's internal components and the middle dimension is a pseudo-batch
// of size 1. The model implementation (see qwen2vl/classes.cpp, around line 1208) expects
// a squeezed layout [3, total_token_num] at inference time.
//
// To support Qwen2-VL without changing other models, handle only this specific M-RoPE
// shape by removing the pseudo-batch dimension.
constexpr std::size_t qwen2vl_mrope_components = 3; // M-RoPE internal components
constexpr std::size_t qwen2vl_mrope_batch = 1; // pseudo-batch dimension
const auto& position_ids_shape = position_ids.get_shape();
if (position_ids_shape.size() == 3 &&
position_ids_shape[0] == qwen2vl_mrope_components &&
position_ids_shape[1] == qwen2vl_mrope_batch) {
// M-RoPE: squeeze pseudo-batch dim [3, 1, total_token_num] -> [3, total_token_num]

+            position_ids.set_shape({position_ids_shape[0], position_ids_shape[2]});
         }
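Viewed in isolation, the new guard behaves differently from the flatten it replaces. A minimal sketch, using a plain std::vector as a stand-in for ov::Shape and a hypothetical helper name (adjust_position_ids_shape is not part of the actual change):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Stand-in for ov::Shape; adjust_position_ids_shape is a hypothetical
// helper mirroring the guard added in model_runner.hpp.
using Shape = std::vector<std::size_t>;

// Squeeze only the M-RoPE layout [3, 1, N] -> [3, N];
// every other shape is passed through unchanged.
Shape adjust_position_ids_shape(const Shape& s) {
    if (s.size() == 3 && s[0] == 3 && s[1] == 1) {
        return {s[0], s[2]};  // drop the pseudo-batch dimension
    }
    return s;
}
```

Under the old code, any rank-3 shape was collapsed to rank 1; with the guard, a shape such as {3, 2, 7} now reaches the model untouched.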
Comment on lines +517 to 522
Copilot AI Feb 26, 2026

This change affects M-RoPE position_ids handling for models like Qwen2-VL, but there's no test coverage for Qwen2-VL models in continuous batching mode. The test test_vlm_continuous_batching_generate_vs_add_request uses @parametrize_one_model_pa which only tests MODEL_IDS[0] (tiny-random-minicpmv-2_6), not the Qwen2-VL models that use M-RoPE with 3D position_ids.

According to custom coding guideline 1000000 rule 9: "Tests have been updated or added to cover the new code." Since this fix addresses M-RoPE-specific position_ids handling, tests should be added or updated to verify that Qwen2-VL models work correctly with continuous batching, particularly in the generation phase where 3D position_ids [3, 1, N] are created.

Copilot generated this review using guidance from repository custom instructions.
Comment on lines +517 to 522
Copilot AI Feb 26, 2026

The PR description is incomplete and does not follow the template. According to the custom coding guideline 1000000, rule 1: "PR description must be aligned with pull_request_template.md and its checklist must be filled out."

The description should include:

  • A summary of the change (M-RoPE position_ids shape handling fix)
  • Why this change is needed
  • Whether tests have been added or updated
  • Ticket information if applicable
  • All checklist items must be completed

Comment on lines +517 to 522
Copilot AI Mar 11, 2026

The new M-RoPE position_ids shape adjustment ([3, 1, total_token_num] -> [3, total_token_num]) is a behavior change in the ContinuousBatching inference path, but there does not appear to be any C++ test coverage asserting the expected position_ids rank/shape for VLM/Qwen CB runs. Please add a regression test that fails if 3D position_ids are flattened or otherwise reshaped incorrectly.
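As a sketch of the property such a regression test could pin down, the following hedged example uses plain std::vector shapes and hypothetical names (a real test would run the ContinuousBatching pipeline on a Qwen2-VL model and inspect the tensor handed to the request), contrasting the old flatten with the rank the M-RoPE path expects:

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

using Shape = std::vector<std::size_t>;

// Old behavior: collapse any position_ids shape to rank 1.
Shape old_flatten(const Shape& s) {
    std::size_t total = 1;
    for (std::size_t d : s) total *= d;
    return {total};
}

// The property a regression test would assert: after adjustment,
// M-RoPE position_ids must still be rank 2 with 3 components.
bool is_valid_mrope_layout(const Shape& s) {
    return s.size() == 2 && s[0] == 3;
}
```

The old flatten turns [3, 1, N] into [3*N], which fails this check; the squeezed [3, N] layout passes.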

// typical LLM parameters
if (!m_cached_position_ids) {
6 changes: 2 additions & 4 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -637,12 +637,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
     }
 };

-// TODO: remove it when QWEN ticket-167316/GEMMA3 ticket-171180 is fixed
+// TODO: remove it when GEMMA3 ticket-171180 is fixed
 bool requires_sdpa(const std::filesystem::path& models_dir) {
     auto vlm_config = utils::from_config_json_if_exists<VLMConfig>(models_dir, "config.json");
-    return vlm_config.model_type == VLMModelType::QWEN2_VL ||
-           vlm_config.model_type == VLMModelType::QWEN2_5_VL ||
-           vlm_config.model_type == VLMModelType::GEMMA3;
+    return vlm_config.model_type == VLMModelType::GEMMA3;
 }
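The simplified predicate now keys off a single model type. A hedged sketch of that logic, with a trimmed-down stand-in for the VLMModelType enum (the real enum and config loading live in the visual_language sources and are not reproduced here):

```cpp
#include <cassert>

// Trimmed stand-in for the real VLMModelType enum.
enum class VLMModelType { QWEN2_VL, QWEN2_5_VL, GEMMA3 };

// After this change only Gemma3 still forces SDPA; the Qwen2-VL
// family (ticket-167316) takes the regular attention path.
bool requires_sdpa(VLMModelType model_type) {
    return model_type == VLMModelType::GEMMA3;
}
```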
Comment on lines +640 to 644
Copilot AI Mar 11, 2026

PR description does not follow the repository PR template: the checklist items are missing and the ticket placeholder format (CVS-###) is not used. Please update the PR description to match .github/pull_request_template.md (include the checklist with checked/unchecked items and the CVS ticket line).


VLMPipeline::VLMPipeline(