Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
case ov::element::i32:
h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 addition uses vpaddb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 255 + 1 = 0.
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 add JIT must only be used for u8 output");
h->uni_vpaddb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
Expand All @@ -92,9 +98,20 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
}
}

std::set<std::vector<element::Type>> jit_add_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
    // f32 and i32 additions are always available in this emitter.
    std::set<std::vector<element::Type>> precisions{{element::f32, element::f32}, {element::i32, element::i32}};

    // u8 wrap-around addition is advertised only for pure u8->u8 arithmetic.
    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
    // A null node means a general capability query (SupportedPrecisions functor), for
    // which the u8 pair is deliberately not reported.
    if (node) {
        const bool pure_u8 = ov::intel_cpu::all_of(element::u8,
                                                   node->get_input_element_type(0),
                                                   node->get_input_element_type(1),
                                                   node->get_output_element_type(0));
        if (pure_u8) {
            precisions.insert({element::u8, element::u8});
        }
    }

    return precisions;
}

/// MUL_ADD ///
Expand Down Expand Up @@ -243,6 +260,12 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
case ov::element::i32:
h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 subtraction uses vpsubb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 3 - 4 = 255.
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output");
h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
Expand All @@ -257,8 +280,20 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
}

std::set<std::vector<element::Type>> jit_subtract_emitter::get_supported_precisions(
    const std::shared_ptr<ov::Node>& node) {
    // f32 and i32 subtractions are always available in this emitter.
    std::set<std::vector<element::Type>> precisions{{element::f32, element::f32}, {element::i32, element::i32}};

    // u8 wrap-around subtraction is advertised only for pure u8->u8 arithmetic.
    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
    // A null node means a general capability query (SupportedPrecisions functor), for
    // which the u8 pair is deliberately not reported.
    if (node) {
        const bool pure_u8 = ov::intel_cpu::all_of(element::u8,
                                                   node->get_input_element_type(0),
                                                   node->get_input_element_type(1),
                                                   node->get_output_element_type(0));
        if (pure_u8) {
            precisions.insert({element::u8, element::u8});
        }
    }

    return precisions;
}

/// MULTIPLY ///
Expand Down
38 changes: 30 additions & 8 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "openvino/core/except.hpp"
#include "openvino/core/type/element_type.hpp"
#include "utils/debug_capabilities.h"
#include "utils/general_utils.h"

namespace ov::intel_cpu {

Expand Down Expand Up @@ -264,19 +265,29 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const

std::function<std::unique_ptr<IFunction>(void)> exec_func;
switch (aclEltwiseAttrs.data.algo) {
case Algorithm::EltwiseAdd:
case Algorithm::EltwiseAdd: {
// For u8, Add must wrap on overflow (e.g. 255 + 1 = 0), not saturate.
// Only use wrap-around for pure u8->u8 add.
// QDQ patterns with u8 input but f32/i32 output must saturate.
const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
srcDescs[0]->getPrecision(),
srcDescs[1]->getPrecision(),
dstDescs[0]->getPrecision());
const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticAddition::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
ConvertPolicy::SATURATE)) {
convert_policy)) {
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticAddition>();
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseMultiply:
if (!NEPixelWiseMultiplication::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
Expand All @@ -297,19 +308,30 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
return acl_op;
};
break;
case Algorithm::EltwiseSubtract:
case Algorithm::EltwiseSubtract: {
// For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0.
// Only use wrap-around for pure u8->u8 subtract.
// QDQ patterns with u8 input but f32/i32 output must saturate.
const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
srcDescs[0]->getPrecision(),
srcDescs[1]->getPrecision(),
dstDescs[0]->getPrecision());

const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
ConvertPolicy::SATURATE)) {
convert_policy)) {
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticSubtraction>();
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseDivide:
if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) {
return false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#include "eltwise_overflow.hpp"

#include "common_test_utils/ov_tensor_utils.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/subtract.hpp"

namespace ov {
namespace test {

// Builds a human-readable test name of the form "kind=<KIND>_shape=<SHAPE>".
std::string EltwiseOverflowLayerCPUTest::getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj) {
    const auto& [kind, shape] = obj.param;
    const char* kind_str = (kind == EltwiseOverflowKind::UNDERFLOW) ? "UNDERFLOW" : "OVERFLOW";
    std::ostringstream name;
    name << "kind=" << kind_str << "_shape=" << shape;
    return name.str();
}

void EltwiseOverflowLayerCPUTest::SetUp() {
targetDevice = ov::test::utils::DEVICE_CPU;
abs_threshold = 0;

const auto& [kind, shape] = GetParam();
overflowKind = kind;

InputShape inShape = {{}, {shape}};
init_input_shapes({inShape, inShape});

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));

std::shared_ptr<ov::Node> op;
if (kind == EltwiseOverflowKind::UNDERFLOW) {
op = std::make_shared<ov::op::v1::Subtract>(a, b);
} else {
op = std::make_shared<ov::op::v1::Add>(a, b);
}

auto result = std::make_shared<ov::op::v0::Result>(op);
function = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}, "EltwiseOverflow");
}

// Fills both model inputs with fixed u8 operand patterns that are guaranteed to
// underflow (Subtract) or overflow (Add) regardless of the target shape.
// The patterns repeat cyclically to cover shapes of any size.
void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
    inputs.clear();
    const auto& modelInputs = function->inputs();
    ASSERT_EQ(modelInputs.size(), 2u);
    ASSERT_EQ(targetInputStaticShapes.size(), 2u);

    // Hardcoded values that guarantee underflow/overflow regardless of shape size.
    static const std::vector<uint8_t> underflow_a = {3, 0, 1, 5, 10, 0, 100, 50};
    static const std::vector<uint8_t> underflow_b = {4, 1, 2, 3, 20, 1, 200, 51};
    // Expected results (wrap): 255, 255, 255, 2, 246, 255, 156, 255

    static const std::vector<uint8_t> overflow_a = {255, 254, 200, 128, 255, 250, 255, 1};
    static const std::vector<uint8_t> overflow_b = {1, 2, 100, 128, 255, 10, 128, 255};
    // Expected results (wrap): 0, 0, 44, 0, 254, 4, 127, 0

    const bool is_underflow = (overflowKind == EltwiseOverflowKind::UNDERFLOW);
    const auto& src_a = is_underflow ? underflow_a : overflow_a;
    const auto& src_b = is_underflow ? underflow_b : overflow_b;

    auto t0 = ov::Tensor(ov::element::u8, targetInputStaticShapes[0]);
    auto t1 = ov::Tensor(ov::element::u8, targetInputStaticShapes[1]);

    // Fill each tensor in place: avoids the intermediate std::vector buffers and an
    // extra copy, and sizes each fill by the tensor's own element count instead of
    // assuming both inputs share input 0's size.
    auto fill = [](ov::Tensor& t, const std::vector<uint8_t>& pattern) {
        auto* data = t.data<uint8_t>();
        const size_t count = t.get_size();
        for (size_t i = 0; i < count; ++i) {
            data[i] = pattern[i % pattern.size()];
        }
    };
    fill(t0, src_a);
    fill(t1, src_b);

    inputs.insert({modelInputs[0].get_node_shared_ptr(), t0});
    inputs.insert({modelInputs[1].get_node_shared_ptr(), t1});
}

// Runs inference on the CPU device and compares against the reference implementation.
// abs_threshold is 0 (set in SetUp), so wrap-around results must match exactly.
TEST_P(EltwiseOverflowLayerCPUTest, CompareWithRefs) {
    run();
}

} // namespace test
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#pragma once

#include <gtest/gtest.h>

#include "shared_test_classes/base/ov_subgraph.hpp"

namespace ov {
namespace test {

// Which u8 wrap-around scenario a test instance exercises.
// NOTE(review): UNDERFLOW and OVERFLOW are also X/Open macros defined by <math.h>;
// if a translation unit includes <cmath>/<math.h> before this header these
// enumerator names may clash — confirm this compiles on all supported toolchains.
enum class EltwiseOverflowKind { UNDERFLOW, OVERFLOW };

// Test parameters: (overflow kind, input/output shape).
using EltwiseOverflowTestParams = std::tuple<EltwiseOverflowKind, ov::Shape>;

// Verifies that u8 eltwise Add/Subtract wrap around (mod 256) instead of saturating.
class EltwiseOverflowLayerCPUTest : public testing::WithParamInterface<EltwiseOverflowTestParams>,
                                    virtual public SubgraphBaseTest {
public:
    // Builds a readable test name ("kind=..._shape=...") from the parameters.
    static std::string getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj);

protected:
    // Creates a two-input u8 Add (OVERFLOW) or Subtract (UNDERFLOW) model for the CPU device.
    void SetUp() override;
    // Fills both inputs with fixed patterns guaranteed to overflow/underflow in u8.
    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;

private:
    // Scenario selected by the test parameter; assigned in SetUp().
    EltwiseOverflowKind overflowKind;
};

} // namespace test
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#include "custom/single_layer_tests/classes/eltwise_overflow.hpp"

namespace ov {
namespace test {

// Both wrap-around directions are covered: Subtract underflow and Add overflow.
const std::vector<EltwiseOverflowKind> overflowKinds = {EltwiseOverflowKind::UNDERFLOW, EltwiseOverflowKind::OVERFLOW};

// Shapes chosen to cover the scalar tail and the vectorized JIT code path, plus a typical 4D layout.
const std::vector<ov::Shape> testShapes = {
{4}, // small 1D
{64}, // larger 1D to exercise vectorized JIT path
{1, 2, 2, 2}, // 4D typical NN shape
};

// Cartesian product of kinds x shapes; test names come from getTestCaseName.
INSTANTIATE_TEST_SUITE_P(smoke_EltwiseOverflowU8,
EltwiseOverflowLayerCPUTest,
::testing::Combine(::testing::ValuesIn(overflowKinds), ::testing::ValuesIn(testShapes)),
EltwiseOverflowLayerCPUTest::getTestCaseName);

} // namespace test
} // namespace ov