
Commit b05b367

use new 'GATE_UP_TYPE' in mlp kernel
1 parent ed4e9ca

File tree: 5 files changed, +56 -25 lines

src/plugins/intel_cpu/src/nodes/llm_mlp.cpp

Lines changed: 5 additions & 6 deletions

@@ -379,9 +379,9 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
         auto K = w_gate.size(1);
         auto N = w_gate.size(0);
         OPENVINO_ASSERT(w_gate.stride_bytes(0) == w_up.stride_bytes(0));
-        if (m_config.gate_up_combined) {
+        if (m_config.gate_up_type != LLMMLPNode::GATE_UP_TYPE::SEPARATE) {
             N = w_gate.size(0) / 2;
-            if (m_config.gate_up_swapped) {
+            if (m_config.gate_up_type == LLMMLPNode::GATE_UP_TYPE::COMBINED_UP_GATE) {
                 // When VariadicSplit output[1] connects to gate instead of up, swap the pointers
                 gate_up.setup(w_gate.ptr_v(N, 0), w_gate.ptr_v(), w_gate.stride_bytes(0), N * 2, K, config);
             } else {
@@ -398,19 +398,18 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
         auto* w_scale_gate = pnode->getSrcMemoryAtPort(4)->getDataAs<float>();
         auto* w_scale_up = pnode->getSrcMemoryAtPort(5)->getDataAs<float>();
         auto* dst = m_w_scale_gateup.ptr<float>();
-        if (m_config.gate_up_combined) {
+        if (m_config.gate_up_type != LLMMLPNode::GATE_UP_TYPE::SEPARATE) {
             w_scale_up = w_scale_gate + N;
         }

-        // When gate_up_combined=true and gate_up_swapped=true, we need to swap the scales
+        // When gate_up_type is COMBINED_UP_GATE, we need to swap the scales
         // to match the swapped weight layout
         auto* scale_first = w_scale_gate;
         auto* scale_second = w_scale_up;
-        if (m_config.gate_up_combined && m_config.gate_up_swapped) {
+        if (m_config.gate_up_type == LLMMLPNode::GATE_UP_TYPE::COMBINED_UP_GATE) {
             scale_first = w_scale_up;
             scale_second = w_scale_gate;
         }
-
         for (size_t i = 0; i < N; i += 16) {
             memcpy(dst, scale_first + i, 16 * sizeof(float));
             dst += 16;
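The kernel now branches on gate_up_type instead of two booleans: any non-SEPARATE value means gate and up weights live in one [2*N, K] buffer, and COMBINED_UP_GATE additionally swaps which half is which. A minimal standalone sketch of that pointer arithmetic (the gate_up_ptrs helper and the float element type are illustrative; the plugin's real setup() also takes strides and a kernel config):

#include <cstddef>
#include <cstdint>
#include <utility>

enum class GATE_UP_TYPE : uint8_t { SEPARATE = 0, COMBINED_GATE_UP = 1, COMBINED_UP_GATE = 2 };

// Given a combined weight of shape [2*N, K] stored row-major, return {gate_ptr, up_ptr}.
// COMBINED_GATE_UP keeps gate in rows [0, N); COMBINED_UP_GATE stores up first,
// so the two pointers are simply swapped.
std::pair<const float*, const float*> gate_up_ptrs(const float* w,
                                                   std::size_t N,
                                                   std::size_t K,
                                                   GATE_UP_TYPE type) {
    const float* first = w;           // rows [0, N)
    const float* second = w + N * K;  // rows [N, 2*N)
    if (type == GATE_UP_TYPE::COMBINED_UP_GATE)
        return {second, first};  // up is stored first, so gate lives in the second half
    return {first, second};
}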

src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.cpp

Lines changed: 15 additions & 2 deletions

@@ -23,10 +23,24 @@ EnumNames<ov::intel_cpu::LLMMLPNode::ACT_FN>& EnumNames<ov::intel_cpu::LLMMLPNode::ACT_FN>::get() {
     return enum_names;
 }

+template <>
+EnumNames<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>& EnumNames<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>::get() {
+    static auto enum_names = EnumNames<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>(
+        "op::intel_cpu::LLMMLPNode::GATE_UP_TYPE",
+        {{"SEPARATE", ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE::SEPARATE},
+         {"COMBINED_GATE_UP", ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE::COMBINED_GATE_UP},
+         {"COMBINED_UP_GATE", ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE::COMBINED_UP_GATE}});
+    return enum_names;
+}
+
 std::ostream& operator<<(std::ostream& os, const ov::intel_cpu::LLMMLPNode::ACT_FN& type) {
     return os << as_string(type);
 }

+std::ostream& operator<<(std::ostream& os, const ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE& type) {
+    return os << as_string(type);
+}
+
 namespace intel_cpu {

 bool LLMMLPNode::visit_attributes(ov::AttributeVisitor& visitor) {
@@ -37,8 +51,7 @@ bool LLMMLPNode::visit_attributes(ov::AttributeVisitor& visitor) {
     visitor.on_attribute("down_quantized", m_config.down_quantized);
     visitor.on_attribute("hidden_size", m_config.hidden_size);
     visitor.on_attribute("up_size", m_config.up_size);
-    visitor.on_attribute("gate_up_combined", m_config.gate_up_combined);
-    visitor.on_attribute("gate_up_swapped", m_config.gate_up_swapped);
+    visitor.on_attribute("gate_up_type", m_config.gate_up_type);
     visitor.finish_structure();
     return true;
 }
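Registering the enum with EnumNames is what makes visitor.on_attribute("gate_up_type", m_config.gate_up_type) work: the AttributeAdapter declared in llm_mlp.hpp serializes the value through its string name. A rough usage sketch, assuming OpenVINO's as_string/as_enum helpers from openvino/core/enum_names.hpp:

#include <iostream>
// Assumed includes: "openvino/core/enum_names.hpp" (as_string/as_enum) and the
// plugin's llm_mlp.hpp for the node type.

void gate_up_type_roundtrip() {
    using GateUpType = ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE;
    GateUpType t = GateUpType::COMBINED_UP_GATE;
    std::cout << t << "\n";  // prints "COMBINED_UP_GATE" via the operator<< added above
    GateUpType parsed = ov::as_enum<GateUpType>("SEPARATE");  // string -> enum
    (void)parsed;
}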

src/plugins/intel_cpu/src/transformations/cpu_opset/x64/op/llm_mlp.hpp

Lines changed: 18 additions & 2 deletions

@@ -25,14 +25,19 @@ class LLMMLPNode : public ov::op::Op {

     enum class ACT_FN : uint8_t { SILU = 0, GELU = 1 };

+    enum class GATE_UP_TYPE : uint8_t {
+        SEPARATE = 0,          // separate gate and up projections
+        COMBINED_GATE_UP = 1,  // combined weights, gate first (normal)
+        COMBINED_UP_GATE = 2   // combined weights, up first (swapped)
+    };
+
     struct Config {
         ACT_FN act;
         bool gate_up_quantized;
         bool down_quantized;
         int hidden_size;
         int up_size;
-        bool gate_up_combined;
-        bool gate_up_swapped;  // true when VariadicSplit output[1] connects to gate instead of up
+        GATE_UP_TYPE gate_up_type;
     };

     // args:
@@ -70,6 +75,17 @@ class AttributeAdapter<ov::intel_cpu::LLMMLPNode::ACT_FN>
     OPENVINO_RTTI("AttributeAdapter<ov::intel_cpu::LLMMLPNode::ACT_FN>");
 };

+template <>
+class AttributeAdapter<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>
+    : public EnumAttributeAdapterBase<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE> {
+public:
+    explicit AttributeAdapter(ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE& value)
+        : EnumAttributeAdapterBase<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>(value) {}
+
+    OPENVINO_RTTI("AttributeAdapter<ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE>");
+};
+
 std::ostream& operator<<(std::ostream& s, const ov::intel_cpu::LLMMLPNode::ACT_FN& type);
+std::ostream& operator<<(std::ostream& s, const ov::intel_cpu::LLMMLPNode::GATE_UP_TYPE& type);

 } // namespace ov
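The three-state enum replaces the old (gate_up_combined, gate_up_swapped) pair and removes the one meaningless combination (swapped but not combined). A standalone sketch of the mapping; from_legacy_flags is a hypothetical helper, not part of the commit:

#include <cstdint>

enum class GATE_UP_TYPE : uint8_t { SEPARATE = 0, COMBINED_GATE_UP = 1, COMBINED_UP_GATE = 2 };

// Hypothetical translation from the retired boolean pair to the new enum.
GATE_UP_TYPE from_legacy_flags(bool gate_up_combined, bool gate_up_swapped) {
    if (!gate_up_combined)
        return GATE_UP_TYPE::SEPARATE;  // 'swapped' was only meaningful when combined
    return gate_up_swapped ? GATE_UP_TYPE::COMBINED_UP_GATE
                           : GATE_UP_TYPE::COMBINED_GATE_UP;
}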

src/plugins/intel_cpu/src/transformations/cpu_opset/x64/pass/mlp_fusion.cpp

Lines changed: 15 additions & 12 deletions

@@ -123,8 +123,8 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
         const auto& pattern_map = m.get_pattern_value_map();
         auto root = m.get_match_root();

-        // Check VariadicSplit output connections in combined mode
-        bool gate_up_swapped = false;
+        // Determine gate_up_type based on pattern matching
+        LLMMLPNode::GATE_UP_TYPE gate_up_type = LLMMLPNode::GATE_UP_TYPE::SEPARATE;
         if (pattern_map.count(gate_up_proj_split)) {
             auto mlp_gated_up_node = pattern_map.at(mlp_gated_up).get_node_shared_ptr();
             auto input0 = mlp_gated_up_node->input_value(0);
@@ -134,9 +134,10 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
             // Since pattern matching succeeded, we know one of the outputs connects to Multiply
             if ((input0.get_node() == pattern_map.at(gate_up_proj_split).get_node() && input0.get_index() == 0) ||
                 (input1.get_node() == pattern_map.at(gate_up_proj_split).get_node() && input1.get_index() == 0)) {
-                gate_up_swapped = true;
+                gate_up_type = LLMMLPNode::GATE_UP_TYPE::COMBINED_UP_GATE;  // swapped case
+            } else {
+                gate_up_type = LLMMLPNode::GATE_UP_TYPE::COMBINED_GATE_UP;  // normal combined case
             }
-            // Otherwise, it's the normal case where output[1] connects to Multiply
         }

         auto src = pattern_map.at(input);
@@ -151,17 +152,20 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
         // down projection is harder to quantize w/o causing accuracy problem, so it may be un-quantized instead
         bool is_gate_up_quantized_int8 = false;
         bool is_down_proj_int8 = false;
-        bool is_gate_up_combined = false;
         if (pattern_map.count(gate_up_proj_weight_const_i8) > 0 && pattern_map.count(down_proj_weight_compressed) > 0) {
             // gate-up combined & quantized
             is_gate_up_quantized_int8 = true;
-            is_gate_up_combined = true;
+            gate_up_type = (gate_up_type == LLMMLPNode::GATE_UP_TYPE::SEPARATE)
+                               ? LLMMLPNode::GATE_UP_TYPE::COMBINED_GATE_UP
+                               : gate_up_type;
             gate_proj_w = pattern_map.at(gate_up_proj_weight_const_i8);
             up_proj_w = pattern_map.at(gate_up_proj_weight_const_i8);
             down_proj_w = pattern_map.at(down_proj_weight_compressed);
         } else if (pattern_map.count(gate_up_proj_weight) > 0 && pattern_map.count(down_proj_weight_compressed) > 0) {
             // gate-up combined
-            is_gate_up_combined = true;
+            gate_up_type = (gate_up_type == LLMMLPNode::GATE_UP_TYPE::SEPARATE)
+                               ? LLMMLPNode::GATE_UP_TYPE::COMBINED_GATE_UP
+                               : gate_up_type;
             gate_proj_w = pattern_map.at(gate_up_proj_weight);
             up_proj_w = pattern_map.at(gate_up_proj_weight);
             down_proj_w = pattern_map.at(down_proj_weight_compressed);
@@ -224,7 +228,7 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
             return false;
         }

-        auto up_size = is_gate_up_combined ? (up_shape[0] / 2) : (up_shape[0]);
+        auto up_size = (gate_up_type != LLMMLPNode::GATE_UP_TYPE::SEPARATE) ? (up_shape[0] / 2) : (up_shape[0]);
         auto down_size = up_shape[1];
         if (down_shape[0] != down_size) {
             return false;
@@ -240,8 +244,7 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
         cfg.down_quantized = is_down_proj_int8;
         cfg.hidden_size = down_size;
         cfg.up_size = up_size;
-        cfg.gate_up_combined = is_gate_up_combined;
-        cfg.gate_up_swapped = gate_up_swapped;
+        cfg.gate_up_type = gate_up_type;

         if (pattern_map.count(mlp_silu_gate) > 0) {
             cfg.act = LLMMLPNode::ACT_FN::SILU;
@@ -266,7 +269,7 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
         new_args.push_back(up_proj_w);
         new_args.push_back(down_proj_w);
         if (is_gate_up_quantized_int8) {
-            if (is_gate_up_combined) {
+            if (gate_up_type != LLMMLPNode::GATE_UP_TYPE::SEPARATE) {
                 new_args.push_back(pattern_map.at(gate_up_proj_weight_scales_per_OC));
                 new_args.push_back(pattern_map.at(gate_up_proj_weight_scales_per_OC));
             } else {
@@ -284,7 +287,7 @@ ov::intel_cpu::MLPFusionPass::MLPFusionPass() {
         ov::copy_runtime_info(
             {pattern_map.at(gate_act).get_node_shared_ptr(), pattern_map.at(down_proj).get_node_shared_ptr()},
             new_node);
-        if (is_gate_up_combined) {
+        if (gate_up_type != LLMMLPNode::GATE_UP_TYPE::SEPARATE) {
            ov::copy_runtime_info({pattern_map.at(gate_up_proj).get_node_shared_ptr()}, new_node);
        } else {
            ov::copy_runtime_info({pattern_map.at(mlp_gate_proj).get_node_shared_ptr(),
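Net effect in the pass: split connectivity is inspected first, and the combined-weight branches then promote a still-SEPARATE value to COMBINED_GATE_UP without clobbering an already detected COMBINED_UP_GATE. A condensed sketch with the pattern checks collapsed into booleans (names are illustrative, not the pass's actual variables):

#include <cstdint>

enum class GATE_UP_TYPE : uint8_t { SEPARATE = 0, COMBINED_GATE_UP = 1, COMBINED_UP_GATE = 2 };

GATE_UP_TYPE decide_gate_up_type(bool split_matched,        // VariadicSplit present in the match
                                 bool up_is_split_output0,  // Multiply fed directly by split output[0]
                                 bool combined_weight) {    // combined gate/up weight constant matched
    GATE_UP_TYPE t = GATE_UP_TYPE::SEPARATE;
    if (split_matched)  // step 1: connectivity of the VariadicSplit outputs
        t = up_is_split_output0 ? GATE_UP_TYPE::COMBINED_UP_GATE   // swapped layout
                                : GATE_UP_TYPE::COMBINED_GATE_UP;  // normal layout
    if (combined_weight && t == GATE_UP_TYPE::SEPARATE)
        t = GATE_UP_TYPE::COMBINED_GATE_UP;  // step 2: promote, but keep a detected swap
    return t;
}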

src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/x64/mlp_fusion.cpp

Lines changed: 3 additions & 3 deletions

@@ -97,7 +97,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
     ov::Output<ov::Node> up_output;

     if (param.use_swapped_outputs) {
-        // Create pattern with swapped VariadicSplit outputs to test gate_up_swapped support
+        // Create pattern with swapped VariadicSplit outputs to test COMBINED_UP_GATE type
        ov::test::utils::InputGenerateData in_data;
        in_data.start_from = -0.5;
        in_data.range = 1.0;
@@ -120,7 +120,7 @@ class LLMMLPFusionTest : public testing::WithParamInterface<LLMMLPFusionParams>,
     auto axis_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, -1);
     auto gate_up_split = std::make_shared<ov::op::v1::VariadicSplit>(gate_up_proj, axis_const, split_lengths);

-    // Swap outputs to test gate_up_swapped support
+    // Swap outputs to test COMBINED_UP_GATE type
     auto gate_part = gate_up_split->output(1);  // activation on output[1]
     if (param.act_type == "Swish")
         gate_act = std::make_shared<ov::op::v4::Swish>(gate_part);
@@ -195,7 +195,7 @@ const std::vector<LLMMLPFusionParams> mlp_params = {
     {ishape, 4096 / 4, 11008 / 4, "Swish", false, false},
     {ishape, 4096 / 4, 11008 / 4, "Swish", true, false},

-    // Test case with swapped VariadicSplit outputs (should fuse with gate_up_swapped=true)
+    // Test case with swapped VariadicSplit outputs (should fuse with COMBINED_UP_GATE type)
     {ishape, 4096 / 4, 11008 / 4, "Gelu", false, true},
 };
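For reference, a simplified sketch of the swapped subgraph the new test case builds (weight creation and precision handling omitted; gate_up_proj and up_size stand in for the test's locals). The fusion pass should map this shape to COMBINED_UP_GATE:

#include <cstdint>
#include <memory>
#include <vector>
#include "openvino/core/node.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/swish.hpp"
#include "openvino/op/variadic_split.hpp"

std::shared_ptr<ov::Node> build_swapped_gate_up(const ov::Output<ov::Node>& gate_up_proj, int32_t up_size) {
    auto axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, -1);
    auto lengths = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2},
                                                          std::vector<int32_t>{up_size, up_size});
    auto split = std::make_shared<ov::op::v1::VariadicSplit>(gate_up_proj, axis, lengths);
    auto gate_act = std::make_shared<ov::op::v4::Swish>(split->output(1));  // gate taken from output[1]
    return std::make_shared<ov::op::v1::Multiply>(gate_act, split->output(0));  // up from output[0]
}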
