Skip to content

Commit d060d7a

Browse files
[GPU] Extend ClampFP16Output pass to support clipping input for RMS (openvinotoolkit#29744)
### Details: - Extend the `ClampFP16Output` pass to add a clamp primitive between the `Add` and `RMS` operations. This targets the language-model part of VLM models, which may have an fp16 overflow on the Add output tensor; the overflow can produce Inf values that affect the result of RMS ![image](https://github.com/user-attachments/assets/70624d20-b9dc-405e-a3ff-993365ec3f0c) ### Tickets: - 164349
1 parent 7970784 commit d060d7a

File tree

3 files changed

+89
-5
lines changed

3 files changed

+89
-5
lines changed

src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.cpp

Lines changed: 40 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
#include "clamp_fp16_output.hpp"
66

7-
#include "openvino/core/rt_info.hpp"
7+
#include "ov_ops/rms.hpp"
88
#include "openvino/op/clamp.hpp"
99
#include "openvino/op/constant.hpp"
1010
#include "openvino/op/matmul.hpp"
@@ -14,6 +14,7 @@
1414
#include "openvino/op/multiply.hpp"
1515
#include "openvino/op/subtract.hpp"
1616
#include "openvino/op/divide.hpp"
17+
#include "openvino/core/rt_info.hpp"
1718
#include "openvino/pass/pattern/op/pattern.hpp"
1819
#include "openvino/pass/pattern/op/wrap_type.hpp"
1920
#include "openvino/pass/pattern/op/or.hpp"
@@ -24,6 +25,11 @@
2425
namespace ov::intel_gpu {
2526

2627
ClampFP16Output::ClampFP16Output() {
28+
add_matcher<ClampFP16OutputSoftmaxMatcher>();
29+
add_matcher<ClampFP16OutputRMSMatcher>();
30+
}
31+
32+
ClampFP16OutputSoftmaxMatcher::ClampFP16OutputSoftmaxMatcher() {
2733
using namespace ov::op;
2834
using namespace ov::pass::pattern;
2935
using namespace ov::pass::pattern::op;
@@ -58,7 +64,39 @@ ClampFP16Output::ClampFP16Output() {
5864
return true;
5965
};
6066

61-
auto m = std::make_shared<ov::pass::pattern::Matcher>(softmax_m, "ClampFP16Output");
67+
auto m = std::make_shared<ov::pass::pattern::Matcher>(softmax_m, "ClampFP16OutputSoftmaxMatcher");
68+
this->register_matcher(m, callback);
69+
}
70+
71+
ClampFP16OutputRMSMatcher::ClampFP16OutputRMSMatcher() {
72+
using namespace ov::pass::pattern;
73+
74+
auto add_m = wrap_type<ov::op::v1::Add>({any_input(), any_input()}, type_matches(element::f16));
75+
auto rms_post_m = wrap_type<ov::op::internal::RMS>({any_input(), wrap_type<ov::op::v0::Constant>()}, type_matches(element::f16));
76+
auto add_1_m = wrap_type<ov::op::v1::Add>({add_m, rms_post_m}, type_matches(element::f16));
77+
auto rms_m = wrap_type<ov::op::internal::RMS>({add_1_m, wrap_type<ov::op::v0::Constant>()}, type_matches(element::f16));
78+
79+
ov::matcher_pass_callback callback = [OV_CAPTURE_CPY_AND_THIS](ov::pass::pattern::Matcher& m) {
80+
const auto& pattern_map = m.get_pattern_value_map();
81+
auto rms = ov::as_type_ptr<ov::op::internal::RMS>(pattern_map.at(rms_m).get_node_shared_ptr());
82+
if (!rms || transformation_callback(rms)) {
83+
return false;
84+
}
85+
86+
auto add_1 = pattern_map.at(add_1_m).get_node_shared_ptr();
87+
88+
auto min = static_cast<double>(std::numeric_limits<ov::float16>::lowest());
89+
auto max = static_cast<double>(std::numeric_limits<ov::float16>::max());
90+
auto clamp = std::make_shared<ov::op::v0::Clamp>(rms->get_input_source_output(0), min, max);
91+
clamp->set_friendly_name(add_1->get_friendly_name() + "/ClampFP16Output");
92+
ov::copy_runtime_info(add_1, clamp);
93+
94+
rms->input(0).replace_source_output(clamp);
95+
96+
return true;
97+
};
98+
99+
auto m = std::make_shared<ov::pass::pattern::Matcher>(rms_m, "ClampFP16OutputRMSMatcher");
62100
this->register_matcher(m, callback);
63101
}
64102

src/plugins/intel_gpu/src/plugin/transformations/clamp_fp16_output.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,28 @@
99

1010
namespace ov::intel_gpu {
1111

12+
class ClampFP16Output: public ov::pass::GraphRewrite {
13+
public:
14+
OPENVINO_GRAPH_REWRITE_RTTI("ClampFP16Output");
15+
ClampFP16Output();
16+
};
17+
1218
/**
1319
* @brief This transformation adds Clamp primitive between MatMul and Softmax operation
1420
* which is targeting some transformer based models (mainly Stable Diffusion) which may have an fp16 overflow
1521
* on MatMul output tensor which could lead to Inf/Nan values on the model output.
1622
* We assume that Clamp operation handling costs almost nothing from the performance perspective as it's supposed to be fused to MatMul later
1723
*/
18-
class ClampFP16Output: public ov::pass::MatcherPass {
24+
class ClampFP16OutputSoftmaxMatcher: public ov::pass::MatcherPass {
1925
public:
20-
OPENVINO_MATCHER_PASS_RTTI("ov::intel_gpu::ClampFP16Output");
26+
OPENVINO_MATCHER_PASS_RTTI("ClampFP16OutputSoftmaxMatcher");
27+
ClampFP16OutputSoftmaxMatcher();
28+
};
2129

22-
ClampFP16Output();
30+
class ClampFP16OutputRMSMatcher: public ov::pass::MatcherPass {
31+
public:
32+
OPENVINO_MATCHER_PASS_RTTI("ClampFP16OutputRMSMatcher");
33+
ClampFP16OutputRMSMatcher();
2334
};
2435

2536
} // namespace ov::intel_gpu

src/plugins/intel_gpu/tests/unit/transformations/clamp_fp16_output_test.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "openvino/core/coordinate_diff.hpp"
1414
#include "openvino/core/type/element_type.hpp"
1515
#include <openvino/op/constant.hpp>
16+
#include "ov_ops/rms.hpp"
1617
#include "openvino/op/clamp.hpp"
1718
#include "openvino/op/reshape.hpp"
1819
#include "openvino/op/add.hpp"
@@ -157,3 +158,37 @@ TEST_F(TransformationTestsF, ClampFp16OutputTest6) {
157158
}
158159
comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
159160
}
161+
162+
TEST_F(TransformationTestsF, ClampFp16OutputRMS) {
163+
{
164+
auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
165+
auto input2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
166+
auto add = std::make_shared<ov::op::v1::Add>(input1, input2);
167+
auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
168+
auto gamma1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1, 1, 2560 }, {1});
169+
auto rms_post = std::make_shared<ov::op::internal::RMS>(data, gamma1, 1e-5f, ov::element::f16);
170+
auto add1 = std::make_shared<ov::op::v1::Add>(add, rms_post);
171+
auto gamma2 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1, 1, 2560 }, {1});
172+
auto rms = std::make_shared<ov::op::internal::RMS>(add1, gamma2, 1e-5f, ov::element::f16);
173+
174+
model = std::make_shared<ov::Model>(ov::NodeVector{ rms }, ov::ParameterVector{ input1, input2, data });
175+
manager.register_pass<ClampFP16Output>();
176+
}
177+
{
178+
auto input1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
179+
auto input2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
180+
auto add = std::make_shared<ov::op::v1::Add>(input1, input2);
181+
auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, ov::PartialShape{ -1, -1, 2560 });
182+
auto gamma1 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1, 1, 2560 }, {1});
183+
auto rms_post = std::make_shared<ov::op::internal::RMS>(data, gamma1, 1e-5f, ov::element::f16);
184+
auto add1 = std::make_shared<ov::op::v1::Add>(add, rms_post);
185+
auto min = static_cast<double>(std::numeric_limits<ov::float16>::lowest());
186+
auto max = static_cast<double>(std::numeric_limits<ov::float16>::max());
187+
auto clamp = std::make_shared<ov::op::v0::Clamp>(add1, min, max);
188+
auto gamma2 = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 1, 1, 2560 }, {1});
189+
auto rms = std::make_shared<ov::op::internal::RMS>(clamp, gamma2, 1e-5f, ov::element::f16);
190+
191+
model_ref = std::make_shared<ov::Model>(ov::NodeVector{ rms }, ov::ParameterVector{ input1, input2, data });
192+
}
193+
comparator.enable(FunctionsComparator::CmpValues::ATTRIBUTES);
194+
}

0 commit comments

Comments
 (0)