diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp
index cd75c3b2b16fab..c9979b644e24e1 100644
--- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp
+++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp
@@ -79,6 +79,12 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
     case ov::element::i32:
         h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1);
         break;
+    case ov::element::u8:
+        // u8 addition uses vpaddb which naturally wraps around (mod 256).
+        // This gives correct behavior: e.g., 255 + 1 = 0.
+        OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 add JIT must only be used for u8 output");
+        h->uni_vpaddb(vmm_dst, vmm_src0, vmm_src1);
+        break;
     default:
         OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
     }
@@ -92,9 +98,20 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
     }
 }
 
-std::set<std::vector<element::Type>> jit_add_emitter::get_supported_precisions(
-    [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
-    return {{element::f32, element::f32}, {element::i32, element::i32}};
+std::set<std::vector<element::Type>> jit_add_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
+    std::set<std::vector<element::Type>> supported = {{element::f32, element::f32}, {element::i32, element::i32}};
+
+    // Only enable u8 wrap-around for pure u8->u8 arithmetic.
+    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
+    // node may be nullptr when called from SupportedPrecisions functor (general query).
+    if (node && ov::intel_cpu::all_of(element::u8,
+                                      node->get_input_element_type(0),
+                                      node->get_input_element_type(1),
+                                      node->get_output_element_type(0))) {
+        supported.insert({element::u8, element::u8});
+    }
+
+    return supported;
 }
 
 /// MUL_ADD ///
@@ -243,6 +260,12 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
     case ov::element::i32:
         h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
         break;
+    case ov::element::u8:
+        // u8 subtraction uses vpsubb which naturally wraps around (mod 256).
+        // This gives correct behavior: e.g., 3 - 4 = 255.
+        OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output");
+        h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1);
+        break;
     default:
         OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
     }
@@ -257,8 +280,20 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
 }
 
 std::set<std::vector<element::Type>> jit_subtract_emitter::get_supported_precisions(
-    [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
-    return {{element::f32, element::f32}, {element::i32, element::i32}};
+    const std::shared_ptr<ov::Node>& node) {
+    std::set<std::vector<element::Type>> supported = {{element::f32, element::f32}, {element::i32, element::i32}};
+
+    // Only enable u8 wrap-around for pure u8->u8 arithmetic.
+    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
+    // node may be nullptr when called from SupportedPrecisions functor (general query).
+    if (node && ov::intel_cpu::all_of(element::u8,
+                                      node->get_input_element_type(0),
+                                      node->get_input_element_type(1),
+                                      node->get_output_element_type(0))) {
+        supported.insert({element::u8, element::u8});
+    }
+
+    return supported;
 }
 
 /// MULTIPLY ///
diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
index e57c57dc296843..2bfe602c4f96e1 100644
--- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
+++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -36,6 +36,7 @@
 #include "openvino/core/except.hpp"
 #include "openvino/core/type/element_type.hpp"
 #include "utils/debug_capabilities.h"
+#include "utils/general_utils.h"
 
 namespace ov::intel_cpu {
 
@@ -264,19 +265,29 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
     std::function<std::unique_ptr<IFunction>(void)> exec_func;
     switch (aclEltwiseAttrs.data.algo) {
-    case Algorithm::EltwiseAdd:
+    case Algorithm::EltwiseAdd: {
+        // For u8, Add must wrap on overflow (e.g. 255 + 1 = 0), not saturate.
+        // Only use wrap-around for pure u8->u8 add.
+        // QDQ patterns with u8 input but f32/i32 output must saturate.
+        const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
+                                                          srcDescs[0]->getPrecision(),
+                                                          srcDescs[1]->getPrecision(),
+                                                          dstDescs[0]->getPrecision());
+        const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;
+
         if (!NEArithmeticAddition::validate(srcTensorsInfo.data(),
                                             &srcTensorsInfo[1],
                                             dstTensorsInfo.data(),
-                                            ConvertPolicy::SATURATE)) {
+                                            convert_policy)) {
             return false;
         }
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
+        exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
             auto acl_op = std::make_unique<NEArithmeticAddition>();
-            acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
+            acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
             return acl_op;
         };
         break;
+    }
     case Algorithm::EltwiseMultiply:
         if (!NEPixelWiseMultiplication::validate(srcTensorsInfo.data(),
                                                  &srcTensorsInfo[1],
@@ -297,19 +308,30 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
             return acl_op;
         };
         break;
-    case Algorithm::EltwiseSubtract:
+    case Algorithm::EltwiseSubtract: {
+        // For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0.
+        // Only use wrap-around for pure u8->u8 subtract.
+        // QDQ patterns with u8 input but f32/i32 output must saturate.
+        const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
+                                                          srcDescs[0]->getPrecision(),
+                                                          srcDescs[1]->getPrecision(),
+                                                          dstDescs[0]->getPrecision());
+
+        const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;
+
         if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(),
                                                &srcTensorsInfo[1],
                                                dstTensorsInfo.data(),
-                                               ConvertPolicy::SATURATE)) {
+                                               convert_policy)) {
             return false;
         }
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
+        exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
             auto acl_op = std::make_unique<NEArithmeticSubtraction>();
-            acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
+            acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
             return acl_op;
         };
         break;
+    }
     case Algorithm::EltwiseDivide:
         if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) {
             return false;
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp
new file mode 100644
index 00000000000000..e747d64d56df09
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp
@@ -0,0 +1,90 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "eltwise_overflow.hpp"
+
+#include "common_test_utils/ov_tensor_utils.hpp"
+#include "openvino/op/add.hpp"
+#include "openvino/op/result.hpp"
+#include "openvino/op/subtract.hpp"
+
+namespace ov {
+namespace test {
+
+std::string EltwiseOverflowLayerCPUTest::getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj) {
+    const auto& [kind, shape] = obj.param;
+    std::ostringstream result;
+    result << "kind=" << (kind == EltwiseOverflowKind::UNDERFLOW ? "UNDERFLOW" : "OVERFLOW");
+    result << "_shape=" << shape;
+    return result.str();
+}
+
+void EltwiseOverflowLayerCPUTest::SetUp() {
+    targetDevice = ov::test::utils::DEVICE_CPU;
+    abs_threshold = 0;
+
+    const auto& [kind, shape] = GetParam();
+    overflowKind = kind;
+
+    InputShape inShape = {{}, {shape}};
+    init_input_shapes({inShape, inShape});
+
+    auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));
+    auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));
+
+    std::shared_ptr<ov::Node> op;
+    if (kind == EltwiseOverflowKind::UNDERFLOW) {
+        op = std::make_shared<ov::op::v1::Subtract>(a, b);
+    } else {
+        op = std::make_shared<ov::op::v1::Add>(a, b);
+    }
+
+    auto result = std::make_shared<ov::op::v0::Result>(op);
+    function = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}, "EltwiseOverflow");
+}
+
+void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
+    inputs.clear();
+    const auto& modelInputs = function->inputs();
+    ASSERT_EQ(modelInputs.size(), 2u);
+    ASSERT_EQ(targetInputStaticShapes.size(), 2u);
+
+    const size_t size = ov::shape_size(targetInputStaticShapes[0]);
+
+    // Hardcoded values that guarantee underflow/overflow regardless of shape size.
+    // Pattern repeats to fill any shape.
+    static const std::vector<uint8_t> underflow_a = {3, 0, 1, 5, 10, 0, 100, 50};
+    static const std::vector<uint8_t> underflow_b = {4, 1, 2, 3, 20, 1, 200, 51};
+    // Expected results (wrap): 255, 255, 255, 2, 246, 255, 156, 255
+
+    static const std::vector<uint8_t> overflow_a = {255, 254, 200, 128, 255, 250, 255, 1};
+    static const std::vector<uint8_t> overflow_b = {1, 2, 100, 128, 255, 10, 128, 255};
+    // Expected results (wrap): 0, 0, 44, 0, 254, 4, 127, 0
+
+    const auto& src_a = (overflowKind == EltwiseOverflowKind::UNDERFLOW) ? underflow_a : overflow_a;
+    const auto& src_b = (overflowKind == EltwiseOverflowKind::UNDERFLOW) ? underflow_b : overflow_b;
+
+    std::vector<uint8_t> data0(size);
+    std::vector<uint8_t> data1(size);
+
+    for (size_t i = 0; i < size; ++i) {
+        data0[i] = src_a[i % src_a.size()];
+        data1[i] = src_b[i % src_b.size()];
+    }
+
+    auto t0 = ov::Tensor(ov::element::u8, targetInputStaticShapes[0]);
+    auto t1 = ov::Tensor(ov::element::u8, targetInputStaticShapes[1]);
+    std::copy(data0.begin(), data0.end(), t0.data<uint8_t>());
+    std::copy(data1.begin(), data1.end(), t1.data<uint8_t>());
+
+    inputs.insert({modelInputs[0].get_node_shared_ptr(), t0});
+    inputs.insert({modelInputs[1].get_node_shared_ptr(), t1});
+}
+
+TEST_P(EltwiseOverflowLayerCPUTest, CompareWithRefs) {
+    run();
+}
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp
new file mode 100644
index 00000000000000..6d65f2952e0312
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp
@@ -0,0 +1,32 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <tuple>
+
+#include "shared_test_classes/base/ov_subgraph.hpp"
+
+namespace ov {
+namespace test {
+
+enum class EltwiseOverflowKind { UNDERFLOW, OVERFLOW };
+
+typedef std::tuple<EltwiseOverflowKind, ov::Shape> EltwiseOverflowTestParams;
+
+class EltwiseOverflowLayerCPUTest : public testing::WithParamInterface<EltwiseOverflowTestParams>,
+                                    virtual public SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj);
+
+protected:
+    void SetUp() override;
+    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;
+
+private:
+    EltwiseOverflowKind overflowKind;
+};
+
+}  // namespace test
+}  // namespace ov
diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp
new file mode 100644
index 00000000000000..45c3633e2dcaaa
--- /dev/null
+++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp
@@ -0,0 +1,24 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "custom/single_layer_tests/classes/eltwise_overflow.hpp"
+
+namespace ov {
+namespace test {
+
+const std::vector<EltwiseOverflowKind> overflowKinds = {EltwiseOverflowKind::UNDERFLOW, EltwiseOverflowKind::OVERFLOW};
+
+const std::vector<ov::Shape> testShapes = {
+    {4},           // small 1D
+    {64},          // larger 1D to exercise vectorized JIT path
+    {1, 2, 2, 2},  // 4D typical NN shape
+};
+
+INSTANTIATE_TEST_SUITE_P(smoke_EltwiseOverflowU8,
+                         EltwiseOverflowLayerCPUTest,
+                         ::testing::Combine(::testing::ValuesIn(overflowKinds), ::testing::ValuesIn(testShapes)),
+                         EltwiseOverflowLayerCPUTest::getTestCaseName);
+
+}  // namespace test
+}  // namespace ov
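
Reviewer note (not part of the patch): below is a minimal standalone C++ sketch of the wrap vs. saturate semantics the u8 paths above rely on. The helper names (wrap_add, wrap_sub, sat_add, sat_sub) are hypothetical, for illustration only. Unsigned arithmetic in C++ is defined to wrap mod 2^N, which is exactly what vpaddb/vpsubb and ACL's ConvertPolicy::WRAP compute, while ConvertPolicy::SATURATE clamps to [0, 255]. The asserts also cross-check the expected values hard-coded in eltwise_overflow.cpp.

#include <cassert>
#include <cstdint>

// Illustrative scalar models (hypothetical helpers, not OpenVINO/ACL code).
// wrap_*: mod-256 behavior of vpaddb/vpsubb and ACL ConvertPolicy::WRAP.
// sat_*:  clamping behavior of ACL ConvertPolicy::SATURATE.
static uint8_t wrap_add(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a + b);  // unsigned wrap-around is well-defined in C++
}
static uint8_t wrap_sub(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a - b);
}
static uint8_t sat_add(uint8_t a, uint8_t b) {
    const int s = a + b;  // operands promote to int, so no overflow here
    return static_cast<uint8_t>(s > 255 ? 255 : s);
}
static uint8_t sat_sub(uint8_t a, uint8_t b) {
    const int s = a - b;
    return static_cast<uint8_t>(s < 0 ? 0 : s);
}

int main() {
    // The examples called out in the emitter comments.
    assert(wrap_add(255, 1) == 0);   // overflow wraps; SATURATE would give 255
    assert(wrap_sub(3, 4) == 255);   // underflow wraps; SATURATE would give 0
    assert(sat_add(255, 1) == 255);
    assert(sat_sub(3, 4) == 0);

    // Cross-check the test vectors and expected wrap results from eltwise_overflow.cpp.
    const uint8_t ua[] = {3, 0, 1, 5, 10, 0, 100, 50};
    const uint8_t ub[] = {4, 1, 2, 3, 20, 1, 200, 51};
    const uint8_t u_exp[] = {255, 255, 255, 2, 246, 255, 156, 255};
    const uint8_t oa[] = {255, 254, 200, 128, 255, 250, 255, 1};
    const uint8_t ob[] = {1, 2, 100, 128, 255, 10, 128, 255};
    const uint8_t o_exp[] = {0, 0, 44, 0, 254, 4, 127, 0};
    for (int i = 0; i < 8; ++i) {
        assert(wrap_sub(ua[i], ub[i]) == u_exp[i]);
        assert(wrap_add(oa[i], ob[i]) == o_exp[i]);
    }
    return 0;
}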