Commit c0094c6

Update RopeFusion to support Qwen model after SDPA to PA conversion. (#28620)
Details: Update RoPEFusion for the Qwen model.
Tickets: [CVS-161067](https://jira.devtools.intel.com/browse/CVS-161067)
1 parent 6eed0fb · commit c0094c6

File tree: 3 files changed, +138 -17 lines

Diff for: src/common/transformations/src/transformations/common_optimizations/fuse_rotary_positional_embeddings.cpp (+40 -15)
@@ -723,6 +723,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
     auto rotary_emb_cos = makePattern("[1,?,1,?]");  // [1,..4096,1,128]
     auto rotary_emb_sin = makePattern("[1,?,1,?]");  // [1,..4096,1,128]
     auto qkv_proj = makePattern("[?,?,?]");          // [?,?,12288]
+    auto position_ids = makePattern();

     auto head_cnt = ov::gen_pattern::Symbol("head_cnt");
     auto head_size = ov::gen_pattern::Symbol("head_size");
@@ -749,14 +750,19 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
     auto ScatterUpdate_463814 = makePattern<opset3::ScatterUpdate>({{0, 0}, {1}, Gather_377635 | neg_Multiply, {0}});
     auto slice_Slice_446 =
         makePattern<ov::opset8::Slice>({rotary_emb_cos, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});
+
+    auto gather_cos_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_cos, position_ids, 1}, {{"batch_dims", 0}});
+    auto reshape_cos_to_expected_layout =
+        makePattern<opset8::Reshape>({gather_cos_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});
+
     auto slice_StridedSlice_446 = GenStridedSlice(rotary_emb_cos,
                                                   ScatterUpdate_463814,
                                                   {0, INT_MAX},
                                                   {1, 1},
                                                   1);  // tensor_array<f32[1,..4096,1,128]>
-    auto mul_Multiply_552 =
-        makePattern<opset1::Multiply>({slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446},
-                                      {{"auto_broadcast", "numpy"}});  // tensor_array<f32[?,?,32,128]>
+    auto mul_Multiply_552 = makePattern<opset1::Multiply>(
+        {slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446 | reshape_cos_to_expected_layout},
+        {{"auto_broadcast", "numpy"}});  // tensor_array<f32[?,?,32,128]>

     auto reshape_opt1 = [&](std::shared_ptr<Node> input_BLHS) {
         auto ShapeOf_485814 = makePattern<opset1::ShapeOf>({input_BLHS}, {});
@@ -790,18 +796,28 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
         makePattern<opset1::Squeeze>({Multiply_567527, -2});  // tensor_array<f32[?,?,32,64]>
     auto ListUnpack_586_Squeeze =
         makePattern<opset1::Squeeze>({ListUnpack_586_Split->output(0), -2});  // tensor_array<f32[?,?,32,64]>
-    auto cat_Concat_593 = makePattern<opset1::Concat>({ListUnpack_586_Squeeze_0, ListUnpack_586_Squeeze},
-                                                      {{"axis", -1}});  // tensor_array<f32[?,?,32,128]>
+
+    auto ListUnpack_Squeeze_0_1 =
+        makePattern<opset1::Reshape>({Multiply_567527, {-1, 1, 32, 64}}, {{"special_zero", false}});
+    auto ListUnpack_Squeeze_1 =
+        makePattern<opset1::Reshape>({ListUnpack_586_Split->output(0), {-1, 1, 32, 64}}, {{"special_zero", false}});
+
+    auto cat_Concat_593 = makePattern<opset1::Concat>(
+        {ListUnpack_586_Squeeze_0 | ListUnpack_Squeeze_0_1, ListUnpack_586_Squeeze | ListUnpack_Squeeze_1},
+        {{"axis", -1}});  // tensor_array<f32[?,?,32,128]>
     auto slice_StridedSlice_470 = GenStridedSlice(rotary_emb_sin,
                                                   ScatterUpdate_463814,
                                                   {0, INT_MAX},
                                                   {1, 1},
                                                   1);  // tensor_array<f32[1,..4096,1,128]>
     auto slice_Slice_470 =
         makePattern<opset8::Slice>({rotary_emb_sin, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});
-    auto mul_Multiply_594 =
-        makePattern<opset1::Multiply>({cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470},
-                                      {{"auto_broadcast", "numpy"}});  // tensor_array<f32[?,?,32,128]>
+    auto gather_sin_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_sin, position_ids, 1}, {{"batch_dims", 0}});
+    auto reshape_sin_to_expected_layout =
+        makePattern<opset8::Reshape>({gather_sin_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});
+    auto mul_Multiply_594 = makePattern<opset1::Multiply>(
+        {cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470 | reshape_sin_to_expected_layout},
+        {{"auto_broadcast", "numpy"}});  // tensor_array<f32[?,?,32,128]>
     auto add_Add_597 = makePattern<opset1::Add>({mul_Multiply_552, mul_Multiply_594},
                                                 {{"auto_broadcast", "numpy"}});  // tensor_array<f32[?,?,32,128]>

@@ -858,16 +874,25 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
         new_args.push_back(pattern_map.at(rotary_emb_cos));
         new_args.push_back(pattern_map.at(rotary_emb_sin));

+        ov::NodeVector rt_from = {pattern_map.at(Multiply_567527).get_node_shared_ptr(),
+                                  pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
+                                  pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
+                                  pattern_map.at(add_Add_597).get_node_shared_ptr()};
+
+        if (pattern_map.count(position_ids)) {
+            new_args.push_back(pattern_map.at(position_ids));
+            config.gather_position_arg_id = 3;
+            rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_0_1).get_node_shared_ptr());
+            rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_1).get_node_shared_ptr());
+        } else {
+            rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr());
+            rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr());
+        }
+
         auto old_node = root;
         auto new_node = std::make_shared<op::internal::RoPE>(new_args, config);
         new_node->set_friendly_name(old_node->get_friendly_name());
-        ov::copy_runtime_info({pattern_map.at(Multiply_567527).get_node_shared_ptr(),
-                               pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr(),
-                               pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr(),
-                               pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
-                               pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
-                               pattern_map.at(add_Add_597).get_node_shared_ptr()},
-                              new_node);
+        ov::copy_runtime_info(rt_from, new_node);
         ov::replace_node(old_node, new_node);
         return true;
     };
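
With this change, the cos/sin tables can reach the element-wise Multiply either through the original Slice/StridedSlice branch or through a Gather indexed by position_ids followed by a Reshape, which is the shape the graph takes after the SDPA-to-PagedAttention conversion; when the position_ids branch is the one that matched, it is appended as the fourth RoPE argument and config.gather_position_arg_id is set to 3. The following is a minimal, self-contained sketch, not OpenVINO code: Table, slice_tail and gather_rows are made-up stand-ins that only contrast the two row-selection schemes. For contiguous positions at the tail of the table they pick the same rows, while the Gather form also covers explicit, possibly non-contiguous position ids.

```cpp
// Sketch (assumptions, not OpenVINO code): contrast the two cos/sin selection
// paths the updated pattern accepts: slicing the tail of the cos table versus
// gathering rows by explicit position ids (the axis-1 Gather in the graph).
#include <cstddef>
#include <iostream>
#include <vector>

using Table = std::vector<std::vector<float>>;  // [max_pos][rotary_dims]

// Original path: the last seq_len rows of the table.
Table slice_tail(const Table& cos, size_t seq_len) {
    return Table(cos.end() - seq_len, cos.end());
}

// New path: rows picked by explicit position ids.
Table gather_rows(const Table& cos, const std::vector<size_t>& position_ids) {
    Table out;
    for (auto p : position_ids) out.push_back(cos[p]);
    return out;
}

int main() {
    Table cos(8, std::vector<float>(4, 0.f));
    for (size_t p = 0; p < cos.size(); ++p) cos[p][0] = static_cast<float>(p);

    auto a = slice_tail(cos, 3);           // rows 5, 6, 7 (tail of the table)
    auto b = gather_rows(cos, {5, 6, 7});  // same rows, chosen explicitly
    std::cout << (a == b ? "same rows\n" : "different rows\n");
    return 0;
}
```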

Diff for: src/common/transformations/tests/common_optimizations/fuse_rotary_positional_embeddings.cpp (+80 -0)
@@ -1217,6 +1217,86 @@ TEST_F(TransformationTestsF, ConvertToROPE_chatGLM3_PagedAttention) {
     }
 }

+TEST_F(TransformationTestsF, ConvertToROPE_Qwen_PagedAttention) {
+    using namespace ov;
+
+    {
+        auto position_ids = std::make_shared<opset1::Parameter>(ov::element::i64, ov::PartialShape{-1, -1});
+        auto qkv = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, 1, 3 * 4096});
+
+        auto qkv_proj = makeOP<opset1::VariadicSplit>({qkv, 2, {4096, 4096, -1}});
+
+        auto view_Reshape = makeOP<opset1::Reshape>({qkv_proj->output(0), {0, 0, 32, 128}}, {{"special_zero", true}});
+        auto slice_Slice_4 = makeOP<opset8::Slice>({view_Reshape, {0}, {128}, {1}, {3}});
+        auto slice_Slice = makeConst(element::f32, ov::Shape({1, 4096, 1, 128}), {1});
+
+        auto Convert_50535 = makeOP<opset1::Convert>({position_ids}, {{"destination_type", "i32"}});
+        auto Unsqueeze_23750 = makeOP<opset1::Reshape>({Convert_50535, {-1, 1}}, {{"special_zero", false}});
+
+        auto slice_Slice_1 = makeOP<opset8::Gather>({slice_Slice, Unsqueeze_23750, 1}, {{"batch_dims", 0}});
+        auto Reshape_27400 = makeOP<opset1::Reshape>({slice_Slice_1, {-1, 1, 1, 128}}, {{"special_zero", false}});
+
+        auto mul_Multiply = makeOP<opset1::Multiply>({slice_Slice_4, Reshape_27400}, {{"auto_broadcast", "numpy"}});
+        auto reshape_Reshape = makeOP<opset1::Reshape>({slice_Slice_4, {0, 0, 32, 2, 64}}, {{"special_zero", true}});
+        auto ListUnpack_Split = makeOP<opset1::Split>({reshape_Reshape, -2}, {{"num_splits", 2}});
+        auto Multiply_54136 =
+            makeOP<opset1::Multiply>({ListUnpack_Split->output(1), -1.000000f}, {{"auto_broadcast", "numpy"}});
+        auto ListUnpack_Squeeze_0 =
+            makeOP<opset1::Reshape>({Multiply_54136, {-1, 1, 32, 64}}, {{"special_zero", false}});
+        auto ListUnpack_Squeeze =
+            makeOP<opset1::Reshape>({ListUnpack_Split->output(0), {-1, 1, 32, 64}}, {{"special_zero", false}});
+        auto cat_Concat = makeOP<opset1::Concat>({ListUnpack_Squeeze_0, ListUnpack_Squeeze}, {{"axis", -1}});
+
+        auto slice_Slice_2 = makeConst(element::f32, ov::Shape({1, 4096, 1, 128}), {1});
+        auto slice_Slice_6 = makeOP<opset8::Gather>({slice_Slice_2, Unsqueeze_23750, 1}, {{"batch_dims", 0}});
+        auto Reshape_27408 = makeOP<opset1::Reshape>({slice_Slice_6, {-1, 1, 1, 128}}, {{"special_zero", false}});
+        auto mul_Multiply_1 = makeOP<opset1::Multiply>({cat_Concat, Reshape_27408}, {{"auto_broadcast", "numpy"}});
+        auto add_Add = makeOP<opset1::Add>({mul_Multiply, mul_Multiply_1}, {{"auto_broadcast", "numpy"}});
+
+        auto slice_Slice_10 = makeConst(element::f32, ov::Shape({1, 32767, 1, 1}), {1});
+        auto view_Reshape_1 = makeOP<opset1::Reshape>({qkv_proj->output(1), {0, 0, 32, 128}}, {{"special_zero", true}});
+        auto slice_Slice_11 = makeOP<opset8::Slice>({view_Reshape_1, {0}, {128}, {1}, {3}});
+        auto mul_Multiply_2 = makeOP<opset1::Multiply>({slice_Slice_11, Reshape_27400}, {{"auto_broadcast", "numpy"}});
+        auto reshape_Reshape_1 = makeOP<opset1::Reshape>({slice_Slice_11, {0, 0, 32, 2, 64}}, {{"special_zero", true}});
+        auto ListUnpack_Split_1 = makeOP<opset1::Split>({reshape_Reshape_1, -2}, {{"num_splits", 2}});
+        auto Multiply_54139 =
+            makeOP<opset1::Multiply>({ListUnpack_Split_1->output(1), -1.000000f}, {{"auto_broadcast", "numpy"}});
+        auto ListUnpack_Squeeze_0_1 =
+            makeOP<opset1::Reshape>({Multiply_54139, {-1, 1, 32, 64}}, {{"special_zero", false}});
+        auto ListUnpack_Squeeze_1 =
+            makeOP<opset1::Reshape>({ListUnpack_Split_1->output(0), {-1, 1, 32, 64}}, {{"special_zero", false}});
+        auto cat_Concat_2 = makeOP<opset1::Concat>({ListUnpack_Squeeze_0_1, ListUnpack_Squeeze_1}, {{"axis", -1}});
+        auto mul_Multiply_3 = makeOP<opset1::Multiply>({cat_Concat_2, Reshape_27408}, {{"auto_broadcast", "numpy"}});
+        auto add_Add_1 = makeOP<opset1::Add>({mul_Multiply_2, mul_Multiply_3}, {{"auto_broadcast", "numpy"}});
+        model = std::make_shared<ov::Model>(ov::NodeVector{add_Add_1}, ov::ParameterVector{position_ids, qkv});
+    }
+
+    manager.register_pass<ov::pass::RoPEFusion>(false);
+
+    {
+        auto input = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::PartialShape{-1, 1, 4096 * 3});
+        auto rotary_emp_sin = makeConst(element::f32, ov::Shape({1, 4096, 1, 128}), {1});
+        auto rotary_emp_cos = makeConst(element::f32, ov::Shape({1, 4096, 1, 128}), {1});
+        auto position_ids = std::make_shared<opset1::Parameter>(ov::element::i64, ov::PartialShape{-1, -1});
+        auto Convert_50535 = makeOP<opset1::Convert>({position_ids}, {{"destination_type", "i32"}});
+        auto Unsqueeze_23750 = makeOP<opset1::Reshape>({Convert_50535, {-1, 1}}, {{"special_zero", false}});
+        auto rope = makeOP<ov::op::internal::RoPE>({input, rotary_emp_sin, rotary_emp_cos, Unsqueeze_23750},
+                                                   {{"config.slice_start", 4096},
+                                                    {"config.slice_stop", 8192},
+                                                    {"config.input_trans0213", false},
+                                                    {"config.output_trans0213", false},
+                                                    {"config.is_interleaved", false},
+                                                    {"config.rotary_ndims", 128},
+                                                    {"config.is_chatglm", false},
+                                                    {"config.support_2d_rope", false},
+                                                    {"config.is_qwen", true},
+                                                    {"config.head_cnt", 32},
+                                                    {"config.head_size", 128},
+                                                    {"config.gather_position_arg_id", 3}});
+        model_ref = std::make_shared<ov::Model>(ov::NodeVector{rope}, ov::ParameterVector{input, position_ids});
+    }
+}
+
 TEST_F(TransformationTestsF, ConvertToROPE_GPTJ_PagedAttention) {
     disable_rt_info_check();
     const int batch = -1;

Diff for: src/plugins/intel_cpu/src/nodes/rope.cpp (+18 -2)
@@ -338,11 +338,16 @@ struct RoPE::RoPEExecutorQwen : public RoPE::Executor {
        ov::intel_cpu::PlainTensor t_cos(inputs[1]);   // [1, present-kv-length, 1, rotary_dims]
        ov::intel_cpu::PlainTensor t_sin(inputs[2]);   // [1, present-kv-length, 1, rotary_dims]
        ov::intel_cpu::PlainTensor t_dst(outputs[0]);  // [batch, length, head_cnt, head_size]>
+       ov::intel_cpu::PlainTensor gather;
+
        auto rotary_dims = t_cos.size(3);

        if (m_config.slice_stop - m_config.slice_start > 0) {
            t_src = t_src.slice(2, m_config.slice_start, m_config.slice_stop);
        }
+       if (m_config.gather_position_arg_id > 0) {
+           gather.reset(inputs[m_config.gather_position_arg_id]);
+       }

        auto batch_size = t_src.size(0);
        auto seq_len = t_src.size(1);
@@ -351,9 +356,20 @@ struct RoPE::RoPEExecutorQwen : public RoPE::Executor {
        auto present_kv_len = t_cos.size(1);

        parallel_for3d(batch_size, seq_len, head_cnt, [&](size_t b, size_t p, size_t h) {
+           size_t sincos_pos;
+           if (gather) {
+               if (gather.m_rank == 4) {
+                   sincos_pos = gather.at<int32_t>({b, h, p, 0}, true);
+               } else {
+                   sincos_pos = gather.at<int32_t>({b, p}, true);
+               }
+           } else {
+               sincos_pos = present_kv_len - seq_len + p;
+           }
+
            auto* src = t_src.ptr<T>(b, p, h * head_size);
-           auto* cos = &t_cos.at<float>({b, present_kv_len - seq_len + p, h, 0}, true);
-           auto* sin = &t_sin.at<float>({b, present_kv_len - seq_len + p, h, 0}, true);
+           auto* cos = &t_cos.at<float>({b, sincos_pos, h, 0}, true);
+           auto* sin = &t_sin.at<float>({b, sincos_pos, h, 0}, true);
            auto* dst = t_dst.ptr<T>(b, p, h);

            if (m_rotaryKernel) {
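
On the CPU plugin side, RoPEExecutorQwen now reads an optional gathered-positions input: when config.gather_position_arg_id is non-zero, the cos/sin row index for each token comes from that tensor, otherwise it falls back to the original present_kv_len - seq_len + p offset. Below is a minimal standalone sketch of that selection rule under the assumption of a flat int32 [batch, seq_len] buffer; Positions and sincos_position are made-up stand-ins for the real PlainTensor input and the inlined logic, and the rank-4 per-head layout handled by the actual code is omitted.

```cpp
// Sketch (assumptions, not plugin code): pick the cos/sin row either from an
// explicit position-ids buffer or from the original running offset.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct Positions {
    std::vector<int32_t> data;  // row-major [batch, seq_len]; empty when absent
    size_t seq_len;
    explicit operator bool() const { return !data.empty(); }
    int32_t at(size_t b, size_t p) const { return data[b * seq_len + p]; }
};

// Mirrors the sincos_pos selection added to the executor.
size_t sincos_position(const Positions& gather, size_t b, size_t p, size_t present_kv_len, size_t seq_len) {
    if (gather) {
        return static_cast<size_t>(gather.at(b, p));  // gathered by position_ids
    }
    return present_kv_len - seq_len + p;              // original fallback
}

int main() {
    Positions ids{{5, 6, 7}, 3};
    for (size_t p = 0; p < 3; ++p) {
        std::cout << "with ids: " << sincos_position(ids, 0, p, 8, 3)
                  << ", fallback: " << sincos_position(Positions{}, 0, p, 8, 3) << "\n";
    }
    return 0;
}
```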
