Skip to content

Commit cb5fb50

Browse files
committed
Add RMS operator constructor without gamma to avoid dummy constant creation
1 parent 4f171cd commit cb5fb50

File tree

13 files changed

+108
-70
lines changed

13 files changed

+108
-70
lines changed

src/common/transformations/include/ov_ops/rms.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,17 @@ class TRANSFORMATIONS_API RMS : public ov::op::Op {
3131
const ov::element::Type output_type = ov::element::dynamic,
3232
bool elementwise_affine = true);
3333

34+
/// @brief Constructs an RMS operation without gamma.
35+
///
36+
/// @param data Input tensor with data
37+
/// @param eps Epsilon for not dividing by zero while normalizing the value
38+
/// @param output_type Output element type
39+
/// @param elementwise_affine A boolean value that when set to True, RMS has learnable affine parameters
40+
RMS(const Output<Node>& data,
41+
double epsilson,
42+
const ov::element::Type output_type = ov::element::dynamic,
43+
bool elementwise_affine = false);
44+
3445
bool visit_attributes(ov::AttributeVisitor& visitor) override;
3546

3647
void validate_and_infer_types() override;

src/common/transformations/src/ov_ops/rms.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,17 @@ RMS::RMS(const Output<Node>& data,
2020
validate_and_infer_types();
2121
}
2222

23+
RMS::RMS(const Output<Node>& data,
24+
double epsilson,
25+
const ov::element::Type output_type,
26+
bool elementwise_affine)
27+
: Op({data}),
28+
m_epsilon(epsilson),
29+
m_output_type(output_type),
30+
m_elementwise_affine(elementwise_affine) {
31+
validate_and_infer_types();
32+
}
33+
2334
bool RMS::visit_attributes(ov::AttributeVisitor& visitor) {
2435
visitor.on_attribute("epsilon", m_epsilon);
2536
visitor.on_attribute("output_type", m_output_type);
@@ -34,6 +45,9 @@ void RMS::validate_and_infer_types() {
3445

3546
std::shared_ptr<Node> RMS::clone_with_new_inputs(const ov::OutputVector& new_args) const {
3647
check_new_args_count(this, new_args);
48+
if (new_args.size() == 1) {
49+
return std::make_shared<RMS>(new_args.at(0), m_epsilon, m_output_type);
50+
}
3751
return std::make_shared<RMS>(new_args.at(0), new_args.at(1), m_epsilon, m_output_type, m_elementwise_affine);
3852
}
3953

src/common/transformations/src/transformations/common_optimizations/rms_fusion.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -133,15 +133,6 @@ RMSFusion::RMSFusion(bool force_tail_convert, bool enable_div_x) {
133133
if (pattern_map.find(gamma_convert) != pattern_map.end()) {
134134
gamma_node = pattern_map.at(gamma_convert).get_node_shared_ptr();
135135
}
136-
} else {
137-
auto input_shape = x_output.get_partial_shape();
138-
if (input_shape.rank().is_dynamic() || input_shape[input_shape.size() - 1].is_dynamic()) {
139-
return false;
140-
}
141-
auto last_dim = input_shape[input_shape.size() - 1].get_length();
142-
auto gamma_shape = ov::Shape{static_cast<size_t>(last_dim)};
143-
auto output_type = mul_or_div_node->get_output_element_type(0);
144-
gamma_node = v0::Constant::create(output_type, gamma_shape, {1.0f});
145136
}
146137

147138
const auto& mean_node = pattern_map.at(mean).get_node_shared_ptr();
@@ -156,7 +147,12 @@ RMSFusion::RMSFusion(bool force_tail_convert, bool enable_div_x) {
156147

157148
auto output_type =
158149
has_gamma ? m.get_match_root()->get_output_element_type(0) : mul_or_div_node->get_output_element_type(0);
159-
auto rms = std::make_shared<ov::op::internal::RMS>(x_output, gamma_node, eps_value, output_type, has_gamma);
150+
std::shared_ptr<ov::op::internal::RMS> rms;
151+
if (has_gamma) {
152+
rms = std::make_shared<ov::op::internal::RMS>(x_output, gamma_node, eps_value, output_type, true);
153+
} else {
154+
rms = std::make_shared<ov::op::internal::RMS>(x_output, eps_value, output_type, false);
155+
}
160156
if (has_gamma) {
161157
rms->set_friendly_name(m.get_match_root()->get_friendly_name());
162158
ov::copy_runtime_info(m.get_matched_nodes(), rms);

src/common/transformations/tests/common_optimizations/rms_norm_decomposition_test.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,7 @@ TEST_F(TransformationTestsF, RMSNormFusionTest10) {
345345
auto input = std::make_shared<ov::opset10::Parameter>(ov::element::f32, ov::Shape{1, 2, 6});
346346
auto scale = std::make_shared<ov::opset10::Parameter>(ov::element::f32, ov::Shape{1, 2, 6});
347347

348-
auto rms_const =
349-
ov::opset10::Constant::create(ov::element::f32, ov::Shape{6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
350-
auto rms = std::make_shared<ov::op::internal::RMS>(input, rms_const, 1e-5f, ov::element::f32, false);
348+
auto rms = std::make_shared<ov::op::internal::RMS>(input, 1e-5f, ov::element::f32, false);
351349
auto mul = std::make_shared<ov::opset10::Multiply>(rms, scale);
352350

353351
model_ref = std::make_shared<ov::Model>(ov::OutputVector{mul}, ov::ParameterVector{input, scale});
@@ -379,9 +377,7 @@ TEST_F(TransformationTestsF, RMSNormFusionTest11) {
379377
auto input = std::make_shared<ov::opset10::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 6});
380378
auto scale = std::make_shared<ov::opset10::Parameter>(ov::element::f32, ov::PartialShape{-1, -1, 6});
381379

382-
auto rms_const =
383-
ov::opset10::Constant::create(ov::element::f32, ov::Shape{6}, {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
384-
auto rms = std::make_shared<ov::op::internal::RMS>(input, rms_const, 1e-6f, ov::element::f32, false);
380+
auto rms = std::make_shared<ov::op::internal::RMS>(input, 1e-6f, ov::element::f32, false);
385381
auto mul = std::make_shared<ov::opset10::Multiply>(rms, scale);
386382

387383
model_ref = std::make_shared<ov::Model>(ov::OutputVector{mul}, ov::ParameterVector{input, scale});

src/plugins/intel_gpu/include/intel_gpu/primitives/rms.hpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@ struct rms : public primitive_base<rms> {
2929
epsilon(epsilon),
3030
elementwise_affine(elementwise_affine) {}
3131

32+
/// @brief Constructs rms primitive without gamma
33+
/// @param id This primitive id
34+
/// @param input Input primitive id
35+
/// @param epsilon Epsilon for not dividing by zero while normalizing
36+
/// @param elementwise_affine A boolean value that when set to True, RMS has learnable affine parameters
37+
rms(const primitive_id& id,
38+
const input_info& input,
39+
const float epsilon,
40+
const bool elementwise_affine = false)
41+
: primitive_base(id, {input}),
42+
epsilon(epsilon),
43+
elementwise_affine(elementwise_affine) {}
44+
3245
/// @brief Epsilon for not dividing by zero while normalizing
3346
float epsilon;
3447
/// @brief A boolean value that when set to True, RMS has learnable affine parameters (gamma)

src/plugins/intel_gpu/src/graph/impls/ocl/rms.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,9 @@ struct rms_impl : typed_primitive_impl_ocl<rms> {
3636
const auto& primitive = impl_param.typed_desc<rms>();
3737
auto params = get_default_params<kernel_selector::rms_params>(impl_param, is_shape_agnostic);
3838

39-
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
39+
if (primitive->elementwise_affine) {
40+
params.inputs.push_back(convert_data_tensor(impl_param.get_input_layout(1)));
41+
}
4042
params.epsilon = primitive->epsilon;
4143
params.ov_input_rank = static_cast<int32_t>(impl_param.get_input_layout().get_partial_shape().size());
4244
params.elementwise_affine = primitive->elementwise_affine;

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rms_gpu_bfyx_opt.cl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,9 @@ REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
2727
KERNEL(rms_gpu_bfyx_opt)(
2828
OPTIONAL_SHAPE_INFO_ARG
2929
const __global INPUT0_TYPE* input,
30+
#if ELEMENTWISE_AFFINE
3031
const __global INPUT1_TYPE* gamma,
32+
#endif
3133
__global OUTPUT_TYPE* output
3234
#if HAS_FUSED_OPS_DECLS
3335
, FUSED_OPS_DECLS

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/rms_gpu_ref.cl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
KERNEL(rms_gpu_ref)(
88
OPTIONAL_SHAPE_INFO_ARG
99
const __global INPUT0_TYPE* input,
10+
#if ELEMENTWISE_AFFINE
1011
const __global INPUT1_TYPE* gamma,
12+
#endif
1113
__global OUTPUT_TYPE* output
1214
#if HAS_FUSED_OPS_DECLS
1315
, FUSED_OPS_DECLS

src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_base.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ KernelsData RMSKernelBase::GetCommonKernelsData(const Params& params) const {
6767
GetUpdateDispatchDataFunc(kd);
6868

6969
auto& kernel = kd.kernels[0];
70+
auto inputs_count = orgParams.elementwise_affine ? 2 : 1;
7071
FillCLKernelData(kernel,
7172
dispatchData,
7273
params.engineInfo,
@@ -76,7 +77,7 @@ KernelsData RMSKernelBase::GetCommonKernelsData(const Params& params) const {
7677
EXE_MODE_DEFAULT,
7778
false,
7879
false,
79-
2,
80+
inputs_count,
8081
GetFusedPrimitiveInputsCount(params),
8182
1,
8283
orgParams.is_shape_agnostic);

src/plugins/intel_gpu/src/kernel_selector/kernels/rms/rms_kernel_bfyx_opt.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -197,12 +197,14 @@ bool RMSKernelBfyxOpt::Validate(const Params& p) const {
197197
DO_NOT_USE_THIS_KERNEL(p.layerID);
198198

199199
const rms_params& params = static_cast<const rms_params&>(p);
200-
const auto& gamma = params.inputs[1];
201-
202-
if (!gamma.is_dynamic()) {
203-
size_t data_size = gamma.LogicalSize();
204-
if (data_size < subgroup_size) {
205-
DO_NOT_USE_THIS_KERNEL(p.layerID);
200+
if (params.elementwise_affine) {
201+
const auto& gamma = params.inputs[1];
202+
203+
if (!gamma.is_dynamic()) {
204+
size_t data_size = gamma.LogicalSize();
205+
if (data_size < subgroup_size) {
206+
DO_NOT_USE_THIS_KERNEL(p.layerID);
207+
}
206208
}
207209
}
208210
return true;

0 commit comments

Comments (0)