#include <vector>

#include "common_test_utils/ov_tensor_utils.hpp"
- #include "openvino/runtime/exec_model_info.hpp"
- #include "shared_test_classes/base/ov_subgraph.hpp"
#include "openvino/op/convert.hpp"
#include "openvino/op/gelu.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/swish.hpp"
+ #include "openvino/op/variadic_split.hpp"
+ #include "openvino/runtime/exec_model_info.hpp"
+ #include "shared_test_classes/base/ov_subgraph.hpp"
+ #include "transformations/rt_info/decompression.hpp"

namespace ov {
namespace test {
@@ -23,6 +25,7 @@ struct LLMMLPFusionParams {
size_t up_size;
std::string act_type;
bool use_dynamic_quant;
+ bool use_swapped_outputs;  // true = create pattern with swapped VariadicSplit outputs (should still fuse)
};

class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>, public ov::test::SubgraphBaseTest {
@@ -39,6 +42,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
result << "up_size=" << obj.param.up_size << "_";
result << "act_type=" << obj.param.act_type << "_";
result << "use_dynamic_quant=" << obj.param.use_dynamic_quant << "_";
+ result << "use_swapped_outputs=" << obj.param.use_swapped_outputs << "_";
result << obj.index;
return result.str();
}
@@ -70,7 +74,8 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
in_data.start_from = 0;
in_data.range = 1;
in_data.resolution = 128;
- auto tensor_scale_per_oc = ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data);
+ auto tensor_scale_per_oc =
+ ov::test::utils::create_and_fill_tensor(ov::element::f32, ov::Shape{OC, 1}, in_data);
auto scale_per_oc = std::make_shared<ov::op::v0::Constant>(tensor_scale_per_oc);

auto weight_deq = std::make_shared<ov::op::v1::Multiply>(weight_const_f32, scale_per_oc);
@@ -85,38 +90,89 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
return std::make_shared<ov::op::v0::Constant>(tensor);
};
if (param.use_dynamic_quant)
- configuration.insert({ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits<uint64_t>::max()});
-
- auto gate_weight = create_const(param.up_size, param.down_size, 100);
- auto up_weight = create_const(param.up_size, param.down_size, 100);
- // down_proj has special cache blocking along K dimension requires lower weight resolution
- auto down_weight = create_const(param.down_size, param.up_size, 16);
-
- auto gate_proj = std::make_shared<ov::op::v0::MatMul>(src, gate_weight, false, true);
- auto up_proj = std::make_shared<ov::op::v0::MatMul>(src, up_weight, false, true);
+ configuration.insert(
+ {ov::hint::dynamic_quantization_group_size.name(), std::numeric_limits<uint64_t>::max()});
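+ // (uint64_t max means no group-size restriction for dynamic quantization)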

std::shared_ptr<Node> gate_act;
- if (param.act_type == "Swish")
- gate_act = std::make_shared<ov::op::v4::Swish>(gate_proj);
- if (param.act_type == "Gelu")
- gate_act = std::make_shared<ov::op::v7::Gelu>(gate_proj);
+ ov::Output<ov::Node> up_output;

- auto gate_up = std::make_shared<ov::op::v1::Multiply>(gate_act, up_proj);
+ if (param.use_swapped_outputs) {
+ // Create pattern with swapped VariadicSplit outputs to test gate_up_swapped support
+ ov::test::utils::InputGenerateData in_data;
+ in_data.start_from = -0.5;
+ in_data.range = 1.0;
+ in_data.resolution = 16;
+
+ // Combined gate_up weight in FP16 format
+ auto tensor_f16 = ov::test::utils::create_and_fill_tensor(ov::element::f16,
+ ov::Shape{param.up_size * 2, param.down_size},
+ in_data);
+ auto gate_up_weight_f16 = std::make_shared<ov::op::v0::Constant>(tensor_f16);
+ auto gate_up_weight_f32 = std::make_shared<ov::op::v0::Convert>(gate_up_weight_f16, ov::element::f32);
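+ // Mark the f16->f32 Convert as weight decompression so the plugin treats the f16 constant as compressed weights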
+ mark_as_decompression(gate_up_weight_f32);
+
+ auto gate_up_proj = std::make_shared<ov::op::v0::MatMul>(src, gate_up_weight_f32, false, true);
+
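+ // Split the combined gate/up projection along the last axis into two equal halves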
+ auto split_lengths = std::make_shared<ov::op::v0::Constant>(
+ ov::element::i32,
+ ov::Shape{2},
+ std::vector<int32_t>{static_cast<int32_t>(param.up_size), static_cast<int32_t>(param.up_size)});
+ auto axis_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, -1);
+ auto gate_up_split = std::make_shared<ov::op::v1::VariadicSplit>(gate_up_proj, axis_const, split_lengths);
+
+ // Swap outputs to test gate_up_swapped support
+ auto gate_part = gate_up_split->output(1);  // activation on output[1]
+ if (param.act_type == "Swish")
+ gate_act = std::make_shared<ov::op::v4::Swish>(gate_part);
+ if (param.act_type == "Gelu")
+ gate_act = std::make_shared<ov::op::v7::Gelu>(gate_part);
+
+ auto up_part = gate_up_split->output(0);  // up branch from output[0] (swapped case)
+ up_output = up_part;
+ } else {
+ // Standard separate weights pattern
+ auto gate_weight = create_const(param.up_size, param.down_size, 100);
+ auto up_weight = create_const(param.up_size, param.down_size, 100);
+
+ auto gate_proj = std::make_shared<ov::op::v0::MatMul>(src, gate_weight, false, true);
+ auto up_proj = std::make_shared<ov::op::v0::MatMul>(src, up_weight, false, true);
+
+ if (param.act_type == "Swish")
+ gate_act = std::make_shared<ov::op::v4::Swish>(gate_proj);
+ if (param.act_type == "Gelu")
+ gate_act = std::make_shared<ov::op::v7::Gelu>(gate_proj);
+
+ up_output = up_proj;
+ }
+
+ // Create compressed down projection weight
+ ov::test::utils::InputGenerateData down_data;
+ down_data.start_from = -0.5;
+ down_data.range = 1;
+ down_data.resolution = 16;
+ auto tensor_f16_down = ov::test::utils::create_and_fill_tensor(ov::element::f16,
+ ov::Shape{param.down_size, param.up_size},
+ down_data);
+ auto down_weight_f16 = std::make_shared<ov::op::v0::Constant>(tensor_f16_down);
+ auto down_weight = std::make_shared<ov::op::v0::Convert>(down_weight_f16, ov::element::f32);
+
+ auto gate_up = std::make_shared<ov::op::v1::Multiply>(gate_act, up_output);
auto output = std::make_shared<ov::op::v0::MatMul>(gate_up, down_weight, false, true);

function = std::make_shared<ov::Model>(ov::OutputVector{output}, ov::ParameterVector{src});
}

void check_results() {
auto exec_model = compiledModel.get_runtime_model();
-
int fused_node_found = 0;
for (const auto& n : exec_model->get_ordered_ops()) {
auto layer_type = n->get_rt_info().at(ov::exec_model_info::LAYER_TYPE).as<std::string>();
if (layer_type == "LLMMLP")
fused_node_found++;
}
- ASSERT_EQ(fused_node_found, 1);
+
+ // Both normal and swapped cases should fuse successfully
+ ASSERT_EQ(fused_node_found, 1) << "Fusion should occur with valid MLP patterns (both normal and swapped cases)";
}
};

@@ -129,13 +185,18 @@ TEST_P(LLMMLPFusionTest, CompareWithRefs) {

namespace {

- static ov::test::InputShape ishape{ov::PartialShape{-1, -1, 4096 / 4}, {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}};
+ static ov::test::InputShape ishape{ov::PartialShape{-1, -1, 4096 / 4},
+ {ov::Shape{1, 8, 4096 / 4}, ov::Shape{5, 37, 4096 / 4}}};

const std::vector<LLMMLPFusionParams> mlp_params = {
- {ishape, 4096 / 4, 11008 / 4, "Gelu", false},
- {ishape, 4096 / 4, 11008 / 4, "Gelu", true},
- {ishape, 4096 / 4, 11008 / 4, "Swish", false},
- {ishape, 4096 / 4, 11008 / 4, "Swish", true},
+ // Standard separate weights cases (should all fuse successfully)
+ {ishape, 4096 / 4, 11008 / 4, "Gelu", false, false},
+ {ishape, 4096 / 4, 11008 / 4, "Gelu", true, false},
+ {ishape, 4096 / 4, 11008 / 4, "Swish", false, false},
+ {ishape, 4096 / 4, 11008 / 4, "Swish", true, false},
+
+ // Test case with swapped VariadicSplit outputs (should fuse with gate_up_swapped=true)
+ {ishape, 4096 / 4, 11008 / 4, "Gelu", false, true},
};

INSTANTIATE_TEST_SUITE_P(smoke_LLMMLPFusion,