Skip to content

Commit 5eb325c

Browse files
committed
Optimize the time cost of tail 'Convert' nodes in the f16 precision mark-up transformation
1 parent 6b6eb4b commit 5eb325c

File tree

2 files changed

+252
-15
lines changed

2 files changed

+252
-15
lines changed

src/plugins/intel_cpu/src/graph.cpp

Lines changed: 171 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2004,20 +2004,18 @@ void Graph::EnforceInferencePrecision() {
20042004
CPU_DEBUG_CAP_ENABLE(EnforceInferPrcDebug inferPrecDebug);
20052005

20062006
const auto inferPrec = getConfig().inferencePrecision;
2007-
if (one_of(inferPrec, element::f32, element::dynamic, ov::element::f16, element::dynamic)) {
2007+
if (one_of(inferPrec, element::f32, element::dynamic)) {
20082008
return; // nothing to do, only precision reduction is currently allowed
20092009
}
2010-
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
20112010
if (inferPrec == ov::element::f16) {
2011+
#if defined(OPENVINO_ARCH_ARM) || defined(OPENVINO_ARCH_ARM64)
20122012
return; // precision is configured by ov::pass::ConvertPrecision
2013-
}
20142013
#endif
2015-
std::function<void(const NodePtr&, std::unordered_set<NodePtr>& skipNodes)> searchForNodesToSkip;
2016-
searchForNodesToSkip = [&](const NodePtr& node, std::unordered_set<NodePtr>& skipNodes) -> void {
2017-
for (size_t i = 0; i < node->getParentEdges().size(); i++) {
2018-
const auto& parent = node->getParentEdgeAt(i)->getParent();
2019-
if (inferPrec == ov::element::bf16) {
2020-
/* list of node types that must be forced to be executed in BF16 precision
2014+
std::function<void(const NodePtr&, std::unordered_map<NodePtr, bool>&)> searchForTailNodes;
2015+
searchForTailNodes = [&](const NodePtr& node, std::unordered_map<NodePtr, bool>& tailNodes) -> void {
2016+
for (size_t i = 0; i < node->getParentEdges().size(); i++) {
2017+
const auto& parent = node->getParentEdgeAt(i)->getParent();
2018+
/* list of node types that must be forced to be executed in F16 precision
20212019
* because of performance gains */
20222020
if (one_of(parent->getType(),
20232021
Type::Convolution, // conv nets
@@ -2029,19 +2027,177 @@ void Graph::EnforceInferencePrecision() {
20292027
Type::Interpolate, // super resolution nets
20302028
Type::PagedAttention, // page attention
20312029
Type::QKVProjection,
2032-
Type::LLMMLP)) {
2030+
Type::LLMMLP,
2031+
Type::Pooling)) {
20332032
continue; // stop at significant nodes
20342033
}
2035-
} else if (inferPrec == ov::element::f16) {
2036-
/* list of node types that must be forced to be executed in FP16 precision
2034+
const auto res = tailNodes.insert({parent, false});
2035+
if (res.second) { // node not visited yet
2036+
searchForTailNodes(parent, tailNodes);
2037+
}
2038+
}
2039+
};
2040+
// collect the tail nodes
2041+
std::unordered_map<NodePtr, bool> tailNodesMap;
2042+
std::unordered_set<ov::element::Type_t> outputPrecisions;
2043+
// starting from output nodes
2044+
for (const auto& entry : outputNodesMap) {
2045+
const auto& output = entry.second;
2046+
if (output->getOriginalInputPrecisionAtPort(0) == inferPrec) {
2047+
continue;
2048+
}
2049+
outputPrecisions.insert(output->getOriginalInputPrecisionAtPort(0));
2050+
searchForTailNodes(output, tailNodesMap);
2051+
}
2052+
if (outputPrecisions.empty()) {
2053+
return;
2054+
}
2055+
2056+
const std::vector<Type> kStartTypes = {Type::Eltwise, Type::MVN};
2057+
const std::vector<Type> kPathTypes = {Type::Reshape, Type::Concatenation, Type::Split};
2058+
std::function<bool(const NodePtr&)> suitableForTailOptimization;
2059+
suitableForTailOptimization = [&](const NodePtr& node) -> bool {
2060+
NodePtr cur = node;
2061+
std::unordered_set<NodePtr> visited;
2062+
while (cur) {
2063+
if (visited.count(cur))
2064+
break;
2065+
visited.insert(cur);
2066+
2067+
size_t parentNum = cur->getParentEdges().size();
2068+
if (parentNum == 0)
2069+
return false;
2070+
2071+
bool allParentSuitable = true;
2072+
for (size_t i = 0; i < parentNum; ++i) {
2073+
auto parent = cur->getParentEdgeAt(i)->getParent();
2074+
if (!parent)
2075+
return false;
2076+
if ((std::find(kStartTypes.begin(), kStartTypes.end(), parent->getType()) != kStartTypes.end()) &&
2077+
tailNodesMap.count(parent)) {
2078+
continue;
2079+
}
2080+
if ((std::find(kPathTypes.begin(), kPathTypes.end(), parent->getType()) != kPathTypes.end()) &&
2081+
tailNodesMap.count(parent)) {
2082+
if (!suitableForTailOptimization(parent)) {
2083+
allParentSuitable = false;
2084+
break;
2085+
}
2086+
} else {
2087+
return false;
2088+
}
2089+
}
2090+
return allParentSuitable;
2091+
}
2092+
return false;
2093+
};
2094+
std::function<void(const NodePtr&, const ov::element::Type_t&)> resetTailPrecision;
2095+
resetTailPrecision = [&](const NodePtr& node, const ov::element::Type_t& outputPrecision) -> void {
2096+
// traverse upwards until encountering the first kStartTypes
2097+
for (size_t i = 0; i < node->getParentEdges().size(); ++i) {
2098+
auto parent = node->getParentEdgeAt(i)->getParent();
2099+
if (!parent)
2100+
continue;
2101+
OPENVINO_ASSERT(tailNodesMap.count(parent),
2102+
"resetTailPrecision: node ",
2103+
parent->getName(),
2104+
" with type ",
2105+
NameFromType(parent->getType()),
2106+
" is not in suitableForTailOptimization set");
2107+
if (tailNodesMap[parent]) {
2108+
continue;
2109+
}
2110+
tailNodesMap[parent] = true;
2111+
if (std::find(kStartTypes.begin(), kStartTypes.end(), parent->getType()) != kStartTypes.end()) {
2112+
// restore the output precision of kStartTypes nodes to the original output precision; input precision remains unchanged
2113+
for (size_t j = 0; j < parent->getOriginalOutputsNumber(); ++j) {
2114+
parent->setOriginalOutputPrecisionAtPort(j, outputPrecision);
2115+
}
2116+
} else {
2117+
// recursively process upwards
2118+
// restore all input and output precisions of the current node to the original output precision
2119+
for (size_t j = 0; j < parent->getOriginalInputsNumber(); ++j) {
2120+
parent->setOriginalInputPrecisionAtPort(j, outputPrecision);
2121+
}
2122+
for (size_t j = 0; j < parent->getOriginalOutputsNumber(); ++j) {
2123+
parent->setOriginalOutputPrecisionAtPort(j, outputPrecision);
2124+
}
2125+
resetTailPrecision(parent, outputPrecision);
2126+
}
2127+
}
2128+
};
2129+
2130+
std::function<void(const NodePtr&)> tailNodesPrecisionOptimizeMain;
2131+
tailNodesPrecisionOptimizeMain = [&](const NodePtr& node) -> void {
2132+
for (size_t i = 0; i < node->getParentEdges().size(); i++) {
2133+
const auto& parent = node->getParentEdgeAt(i)->getParent();
2134+
if (!tailNodesMap.count(parent)) {
2135+
continue;
2136+
}
2137+
if (one_of(parent->getType(), Type::Input, Type::Output, Type::MemoryInput, Type::MemoryOutput)) {
2138+
continue;
2139+
}
2140+
if (parent->keepOrigPrecision()) {
2141+
continue;
2142+
}
2143+
if ((parent->getType() == Type::Convert) && (parent->getOriginalInputPrecisionAtPort(0) == inferPrec) &&
2144+
outputPrecisions.count(parent->getOriginalOutputPrecisionAtPort(0))) {
2145+
bool suitableCase = false;
2146+
auto outprecision = parent->getOriginalOutputPrecisionAtPort(0);
2147+
for (size_t i = 0; i < parent->getParentEdges().size(); ++i) {
2148+
auto p = parent->getParentEdgeAt(i)->getParent();
2149+
if (!p)
2150+
continue;
2151+
if (std::find(kPathTypes.begin(), kPathTypes.end(), p->getType()) != kPathTypes.end()) {
2152+
if (suitableForTailOptimization(p)) {
2153+
suitableCase = true;
2154+
continue;
2155+
}
2156+
} else if (std::find(kStartTypes.begin(), kStartTypes.end(), p->getType()) !=
2157+
kStartTypes.end()) {
2158+
suitableCase = true;
2159+
continue;
2160+
}
2161+
}
2162+
if (suitableCase) {
2163+
// suitable case for tail optimization
2164+
resetTailPrecision(parent, outprecision);
2165+
DropNode(parent);
2166+
}
2167+
continue;
2168+
}
2169+
tailNodesPrecisionOptimizeMain(parent);
2170+
}
2171+
};
2172+
// tail optimization main process
2173+
for (const auto& entry : outputNodesMap) {
2174+
const auto& output = entry.second;
2175+
if (output->getOriginalInputPrecisionAtPort(0) == inferPrec) {
2176+
continue;
2177+
}
2178+
tailNodesPrecisionOptimizeMain(output);
2179+
}
2180+
return;
2181+
}
2182+
2183+
std::function<void(const NodePtr&, std::unordered_set<NodePtr>& skipNodes)> searchForNodesToSkip;
2184+
searchForNodesToSkip = [&](const NodePtr& node, std::unordered_set<NodePtr>& skipNodes) -> void {
2185+
for (size_t i = 0; i < node->getParentEdges().size(); i++) {
2186+
const auto& parent = node->getParentEdgeAt(i)->getParent();
2187+
if (inferPrec == ov::element::bf16) {
2188+
/* list of node types that must be forced to be executed in BF16 precision
20372189
* because of performance gains */
20382190
if (one_of(parent->getType(),
20392191
Type::Convolution, // conv nets
2040-
Type::Deconvolution, // deconv
20412192
Type::FullyConnected, // conv / bert nets
2193+
Type::RNNCell, // recurrent nets
2194+
Type::RNNSeq, // recurrent nets
20422195
Type::MatMul, // bert nets
2043-
Type::Pooling,
2044-
Type::MVN)) {
2196+
Type::ROIPooling, // object detection nets
2197+
Type::Interpolate, // super resolution nets
2198+
Type::PagedAttention, // page attention
2199+
Type::QKVProjection,
2200+
Type::LLMMLP)) {
20452201
continue; // stop at significant nodes
20462202
}
20472203
}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
// Copyright (C) 2018-2025 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "openvino/op/concat.hpp"
6+
#include "openvino/op/constant.hpp"
7+
#include "openvino/op/convert.hpp"
8+
#include "openvino/op/convolution.hpp"
9+
#include "openvino/op/multiply.hpp"
10+
#include "openvino/op/parameter.hpp"
11+
#include "openvino/op/result.hpp"
12+
#include "openvino/op/sigmoid.hpp"
13+
#include "shared_test_classes/base/ov_subgraph.hpp"
14+
#include "utils/cpu_test_utils.hpp"
15+
16+
using namespace CPUTestUtils;
17+
18+
namespace ov {
19+
namespace test {
20+
21+
class EnforceInferencePrecisionFP16TailTest : virtual public SubgraphBaseTest {
22+
public:
23+
static std::string getTestCaseName(testing::TestParamInfo<std::tuple<>> /*obj*/) {
24+
return "EnforceInferencePrecisionFP16TailTest";
25+
}
26+
27+
void SetUp() override {
28+
targetDevice = ov::test::utils::DEVICE_CPU;
29+
configuration = {{ov::hint::inference_precision.name(), ov::element::f16}};
30+
31+
std::vector<InputShape> inputShapes = {{{-1, 16, 16, 16}, {{1, 16, 16, 16}, {2, 16, 16, 16}}}};
32+
33+
init_input_shapes(inputShapes);
34+
35+
auto input = std::make_shared<ov::op::v0::Parameter>(ov::element::f16, inputDynamicShapes[0]);
36+
ov::Shape weights_shape = {16, 16, 1, 1}; // OIHW for 1x1 conv
37+
38+
auto weights = ov::op::v0::Constant::create(ov::element::f16, weights_shape, {1.0f});
39+
auto conv = std::make_shared<ov::op::v1::Convolution>(input,
40+
weights,
41+
ov::Strides{1, 1},
42+
ov::CoordinateDiff{0, 0},
43+
ov::CoordinateDiff{0, 0},
44+
ov::Strides{1, 1});
45+
conv->set_friendly_name("conv_node");
46+
auto mul_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{1, 16, 16, 16}, {2.0f});
47+
auto mul = std::make_shared<ov::op::v1::Multiply>(conv, mul_const);
48+
49+
auto sigmoid = std::make_shared<ov::op::v0::Sigmoid>(conv);
50+
51+
auto concat = std::make_shared<ov::op::v0::Concat>(ov::OutputVector{mul, sigmoid}, 1);
52+
53+
auto convert_to_f32 = std::make_shared<ov::op::v0::Convert>(concat, ov::element::f32);
54+
55+
auto result = std::make_shared<ov::op::v0::Result>(convert_to_f32);
56+
57+
function = std::make_shared<ov::Model>(ov::ResultVector{result},
58+
ov::ParameterVector{input},
59+
"enforce_inference_precision_fp16_tail");
60+
}
61+
62+
void checkResults() {
63+
for (const auto& node : compiledModel.get_runtime_model()->get_ops()) {
64+
if (node->get_friendly_name() == "conv_node") {
65+
ASSERT_EQ(node->get_output_element_type(0), ElementType::f16);
66+
}
67+
}
68+
CheckNumberOfNodesWithType(compiledModel, "Convert", 0);
69+
}
70+
};
71+
namespace {
72+
TEST_F(EnforceInferencePrecisionFP16TailTest, CompareWithRefs) {
73+
if (!ov::with_cpu_x86_avx512_core_amx_fp16())
74+
GTEST_SKIP() << "Skipping test, only fp16 runtime inference precision platform needed" << std::endl;
75+
run();
76+
serialize();
77+
checkResults();
78+
}
79+
} // namespace
80+
} // namespace test
81+
} // namespace ov

0 commit comments

Comments
 (0)