From 4ca0fc2b931a9d1e5f423a68577dd8590b1aa3b9 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Sun, 4 Jan 2026 03:02:53 +0530 Subject: [PATCH 1/8] [CPU] Fix u8 Subtract to use wrap-around instead of saturation Fixes #33164 - Changed ACL executor to use ConvertPolicy::WRAP for u8 subtract - Added u8 support to x64 JIT subtract emitter using vpsubb instruction - Added regression tests for u8 subtract wrap-around behavior --- .../plugin/x64/jit_eltwise_emitters.cpp | 10 +- .../src/nodes/executors/acl/acl_eltwise.cpp | 14 +- .../instances/common/subtract_u8_wrap.cpp | 145 ++++++++++++++++++ 3 files changed, 164 insertions(+), 5 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index cd75c3b2b16fab..285ac4ce9d1c01 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -243,6 +243,12 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, case ov::element::i32: h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1); break; + case ov::element::u8: + // u8 subtraction uses vpsubb which naturally wraps around (mod 256). + // This gives correct behavior: e.g., 3 - 4 = 255. + // See https://github.com/openvinotoolkit/openvino/issues/33164 + h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1); + break; default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } @@ -258,7 +264,9 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, std::set> jit_subtract_emitter::get_supported_precisions( [[maybe_unused]] const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; + // u8 added to support wrap-around behavior for unsigned subtraction. 
+ // See https://github.com/openvinotoolkit/openvino/issues/33164 + return {{element::f32, element::f32}, {element::i32, element::i32}, {element::u8, element::u8}}; } /// MULTIPLY /// diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index e57c57dc296843..30c17a9b9ad2c7 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -297,19 +297,25 @@ bool AclEltwiseExecutor::init(const std::vector& srcDescs, const return acl_op; }; break; - case Algorithm::EltwiseSubtract: + case Algorithm::EltwiseSubtract: { + // For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0. + // See https://github.com/openvinotoolkit/openvino/issues/33164 + const auto convert_policy = + (dstDescs[0]->getPrecision() == ov::element::u8) ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE; + if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data(), - ConvertPolicy::SATURATE)) { + convert_policy)) { return false; } - exec_func = [this]() -> std::unique_ptr { + exec_func = [this, convert_policy]() -> std::unique_ptr { auto acl_op = std::make_unique(); - acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE); + acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy); return acl_op; }; break; + } case Algorithm::EltwiseDivide: if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) { return false; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp new file mode 100644 index 00000000000000..e51c42f0e5abea --- /dev/null +++ 
b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp @@ -0,0 +1,145 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +// Regression test for GitHub issue #33164: +// u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0. +// https://github.com/openvinotoolkit/openvino/issues/33164 + +#include + +#include "openvino/op/parameter.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/subtract.hpp" +#include "openvino/openvino.hpp" + +namespace ov { +namespace test { +namespace { + +class SubtractU8WrapAroundTest : public ::testing::Test { +protected: + void SetUp() override { + core = std::make_shared(); + } + + std::shared_ptr core; +}; + +// Test that u8 subtraction wraps around instead of saturating. +// This is a regression test for https://github.com/openvinotoolkit/openvino/issues/33164 +TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior) { + // Create a simple model: out = a - b (both u8) + auto a = std::make_shared(ov::element::u8, ov::Shape{4}); + auto b = std::make_shared(ov::element::u8, ov::Shape{4}); + auto subtract = std::make_shared(a, b); + auto result = std::make_shared(subtract); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + // Compile for CPU + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // Test cases that exercise underflow: + // Input A: [3, 0, 1, 5] + // Input B: [4, 1, 2, 3] + // Expected with wrap-around (mod 256): [255, 255, 255, 2] + // Wrong saturation result would be: [0, 0, 0, 2] + std::vector input_a = {3, 0, 1, 5}; + std::vector input_b = {4, 1, 2, 3}; + std::vector expected = {255, 255, 255, 2}; + + auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); + auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); + + infer_request.set_tensor(a, tensor_a); + infer_request.set_tensor(b, 
tensor_b); + infer_request.infer(); + + auto output = infer_request.get_output_tensor(0); + auto output_data = output.data(); + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(output_data[i], expected[i]) + << "Mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " + << static_cast(expected[i]) << ". u8 subtraction should wrap around (mod 256), not saturate to 0."; + } +} + +// Test with larger tensor to exercise vector path in JIT +TEST_F(SubtractU8WrapAroundTest, WrapAroundBehaviorLargeVector) { + const size_t size = 64; // Large enough to trigger vectorized JIT path + + auto a = std::make_shared(ov::element::u8, ov::Shape{size}); + auto b = std::make_shared(ov::element::u8, ov::Shape{size}); + auto subtract = std::make_shared(a, b); + auto result = std::make_shared(subtract); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + std::vector input_a(size); + std::vector input_b(size); + std::vector expected(size); + + for (size_t i = 0; i < size; ++i) { + input_a[i] = static_cast(i % 10); // 0-9 repeating + input_b[i] = static_cast((i % 10) + 1); // 1-10 repeating + // Each result should be -1 mod 256 = 255, except when a >= b + expected[i] = static_cast((256 + input_a[i] - input_b[i]) % 256); + } + + auto tensor_a = ov::Tensor(ov::element::u8, {size}, input_a.data()); + auto tensor_b = ov::Tensor(ov::element::u8, {size}, input_b.data()); + + infer_request.set_tensor(a, tensor_a); + infer_request.set_tensor(b, tensor_b); + infer_request.infer(); + + auto output = infer_request.get_output_tensor(0); + auto output_data = output.data(); + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(output_data[i], expected[i]) + << "Mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " + << static_cast(expected[i]); + } +} + +// Test 
with 4D tensor to match typical NN tensor shapes +TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) { + auto a = std::make_shared(ov::element::u8, ov::Shape{1, 2, 2, 2}); + auto b = std::make_shared(ov::element::u8, ov::Shape{1, 2, 2, 2}); + auto subtract = std::make_shared(a, b); + auto result = std::make_shared(subtract); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + // All zeros minus all ones should give all 255s (wrap-around) + std::vector input_a(8, 0); + std::vector input_b(8, 1); + std::vector expected(8, 255); + + auto tensor_a = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_a.data()); + auto tensor_b = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_b.data()); + + infer_request.set_tensor(a, tensor_a); + infer_request.set_tensor(b, tensor_b); + infer_request.infer(); + + auto output = infer_request.get_output_tensor(0); + auto output_data = output.data(); + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(output_data[i], expected[i]) + << "4D tensor mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " + << static_cast(expected[i]); + } +} + +} // namespace +} // namespace test +} // namespace ov From 6a70449407ee0666fd3421a0bcd49f6019fb1ff0 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Tue, 6 Jan 2026 22:24:06 +0530 Subject: [PATCH 2/8] [CPU] Fix u8 subtract to wrap instead of saturate Gate u8 subtract execution to only u8->u8 operations. This ensures wrap-around behavior (e.g., 3 - 4 = 255) for pure u8 arithmetic while preventing u8 execution for dequantization patterns (u8 input, f32/i32 output) where wrap-around would corrupt the math. 
Changes: - Modified get_supported_precisions() to conditionally enable u8 support only when both inputs AND output are u8 - Added defensive assertion in emit_isa() u8 case - Removed [[maybe_unused]] attribute as node parameter is now used Fixes #33164 --- .../plugin/x64/jit_eltwise_emitters.cpp | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index 285ac4ce9d1c01..f0d48ea6592eef 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -247,6 +247,7 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, // u8 subtraction uses vpsubb which naturally wraps around (mod 256). // This gives correct behavior: e.g., 3 - 4 = 255. // See https://github.com/openvinotoolkit/openvino/issues/33164 + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output"); h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1); break; default: @@ -263,10 +264,25 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, } std::set> jit_subtract_emitter::get_supported_precisions( - [[maybe_unused]] const std::shared_ptr& node) { - // u8 added to support wrap-around behavior for unsigned subtraction. - // See https://github.com/openvinotoolkit/openvino/issues/33164 - return {{element::f32, element::f32}, {element::i32, element::i32}, {element::u8, element::u8}}; + const std::shared_ptr& node) { + std::set> supported = { + {element::f32, element::f32}, + {element::i32, element::i32} + }; + + // Only enable u8 wrap-around for pure u8->u8 arithmetic (issue #33164). + // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. 
+ if (node) { + const auto in0 = node->get_input_element_type(0); + const auto in1 = node->get_input_element_type(1); + const auto out = node->get_output_element_type(0); + + if (in0 == element::u8 && in1 == element::u8 && out == element::u8) { + supported.insert({element::u8, element::u8}); + } + } + + return supported; } /// MULTIPLY /// From d8e9383bc17ca929573cafa6e9174079203234c8 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Tue, 6 Jan 2026 22:56:24 +0530 Subject: [PATCH 3/8] [CPU] Fix u8 subtract to wrap instead of saturate Gate u8 subtract execution to only pure u8->u8 operations. This ensures wrap-around behavior (e.g., 3 - 4 = 255) for unsigned arithmetic while preventing u8 execution for dequantization patterns (u8 input, f32/i32 output) where wrap-around would corrupt the math. Changes: - JIT: Modified get_supported_precisions() to enable u8 only when both inputs AND output are u8 - ACL: Added same u8->u8 gating for ConvertPolicy::WRAP - Tests: Added TypeRelaxed regression tests to catch LPT/dequant failures Fixes #33164 --- .../plugin/x64/jit_eltwise_emitters.cpp | 11 +- .../src/nodes/executors/acl/acl_eltwise.cpp | 10 +- .../instances/common/subtract_u8_wrap.cpp | 139 ++++++++++++++++++ 3 files changed, 151 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index f0d48ea6592eef..bd06a3c2017920 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -265,23 +265,20 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, std::set> jit_subtract_emitter::get_supported_precisions( const std::shared_ptr& node) { - std::set> supported = { - {element::f32, element::f32}, - {element::i32, element::i32} - }; - + std::set> supported = {{element::f32, element::f32}, {element::i32, element::i32}}; + // 
Only enable u8 wrap-around for pure u8->u8 arithmetic (issue #33164). // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. if (node) { const auto in0 = node->get_input_element_type(0); const auto in1 = node->get_input_element_type(1); const auto out = node->get_output_element_type(0); - + if (in0 == element::u8 && in1 == element::u8 && out == element::u8) { supported.insert({element::u8, element::u8}); } } - + return supported; } diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index 30c17a9b9ad2c7..d0f351976a5463 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -299,9 +299,15 @@ bool AclEltwiseExecutor::init(const std::vector& srcDescs, const break; case Algorithm::EltwiseSubtract: { // For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0. + // Only use wrap-around for pure u8->u8 subtract (issue #33164). + // QDQ patterns with u8 input but f32/i32 output must saturate. // See https://github.com/openvinotoolkit/openvino/issues/33164 - const auto convert_policy = - (dstDescs[0]->getPrecision() == ov::element::u8) ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE; + + const bool is_u8_u8_to_u8 = (srcDescs[0]->getPrecision() == ov::element::u8) && + (srcDescs[1]->getPrecision() == ov::element::u8) && + (dstDescs[0]->getPrecision() == ov::element::u8); + + const auto convert_policy = is_u8_u8_to_u8 ? 
ConvertPolicy::WRAP : ConvertPolicy::SATURATE; if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp index e51c42f0e5abea..ea6948179e8ee7 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp @@ -5,6 +5,10 @@ // Regression test for GitHub issue #33164: // u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0. // https://github.com/openvinotoolkit/openvino/issues/33164 +// +// Additionally, tests ensure that TypeRelaxed subtract with u8 inputs but +// f32/i32 output does NOT use wrap-around (must give negative values). +// This catches regressions in LPT/dequantization patterns. #include @@ -12,6 +16,7 @@ #include "openvino/op/result.hpp" #include "openvino/op/subtract.hpp" #include "openvino/openvino.hpp" +#include "ov_ops/type_relaxed.hpp" namespace ov { namespace test { @@ -140,6 +145,140 @@ TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) { } } +// ============================================================================ +// TypeRelaxed tests: u8 inputs with non-u8 output (dequantization patterns) +// These tests ensure that the u8 wrap-around path is NOT used when output +// type is f32 or i32 (typical in LPT/QDQ patterns). +// ============================================================================ + +// u8 inputs, but output overridden to f32: MUST NOT wrap, should give negatives. +// This test would have caught the CI failure "unsupported src_prc: u8" crash. 
+TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_NoWrap_NoCrash) { + auto a = std::make_shared(ov::element::u8, ov::Shape{4}); + auto b = std::make_shared(ov::element::u8, ov::Shape{4}); + + // Create a TypeRelaxed subtract: u8 inputs, f32 output + // This simulates dequantization subtract patterns in LPT + using TRSub = ov::op::TypeRelaxed; + auto sub = std::make_shared( + ov::element::TypeVector{ov::element::f32, ov::element::f32}, // origin input types for inference + ov::element::TypeVector{ov::element::f32}, // overridden output type + a, + b); + + auto result = std::make_shared(sub); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + std::vector input_a = {3, 0, 1, 5}; + std::vector input_b = {4, 1, 2, 3}; + + auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); + auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); + + infer_request.set_tensor(a, tensor_a); + infer_request.set_tensor(b, tensor_b); + infer_request.infer(); + + auto out = infer_request.get_output_tensor(0); + ASSERT_EQ(out.get_element_type(), ov::element::f32); + + auto* out_data = out.data(); + // With proper dequantization semantics, these should be NEGATIVE values + // NOT wrap-around values like 255 + std::vector expected = {-1.f, -1.f, -1.f, 2.f}; + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_FLOAT_EQ(out_data[i], expected[i]) + << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] + << ". TypeRelaxed u8->f32 subtract must NOT use wrap-around."; + } +} + +// Same idea, but output overridden to i32. 
+TEST_F(SubtractU8WrapAroundTest, U8Inputs_I32Output_NoWrap_NoCrash) { + auto a = std::make_shared(ov::element::u8, ov::Shape{4}); + auto b = std::make_shared(ov::element::u8, ov::Shape{4}); + + // Create a TypeRelaxed subtract: u8 inputs, i32 output + using TRSub = ov::op::TypeRelaxed; + auto sub = std::make_shared( + ov::element::TypeVector{ov::element::i32, ov::element::i32}, // origin input types for inference + ov::element::TypeVector{ov::element::i32}, // overridden output type + a, + b); + + auto result = std::make_shared(sub); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + std::vector input_a = {3, 0, 1, 5}; + std::vector input_b = {4, 1, 2, 3}; + + infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {4}, input_a.data())); + infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {4}, input_b.data())); + infer_request.infer(); + + auto out = infer_request.get_output_tensor(0); + ASSERT_EQ(out.get_element_type(), ov::element::i32); + + auto* out_data = out.data(); + // With proper dequantization semantics, these should be NEGATIVE values + std::vector expected = {-1, -1, -1, 2}; + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_EQ(out_data[i], expected[i]) << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] + << ". 
TypeRelaxed u8->i32 subtract must NOT use wrap-around."; + } +} + +// Test with larger vector to exercise JIT vectorized path for TypeRelaxed +TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_LargeVector) { + const size_t size = 64; // Large enough to trigger vectorized JIT path + + auto a = std::make_shared(ov::element::u8, ov::Shape{size}); + auto b = std::make_shared(ov::element::u8, ov::Shape{size}); + + using TRSub = ov::op::TypeRelaxed; + auto sub = std::make_shared(ov::element::TypeVector{ov::element::f32, ov::element::f32}, + ov::element::TypeVector{ov::element::f32}, + a, + b); + + auto result = std::make_shared(sub); + auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); + + auto compiled_model = core->compile_model(model, "CPU"); + auto infer_request = compiled_model.create_infer_request(); + + std::vector input_a(size); + std::vector input_b(size); + std::vector expected(size); + + for (size_t i = 0; i < size; ++i) { + input_a[i] = static_cast(i % 10); // 0-9 repeating + input_b[i] = static_cast((i % 10) + 1); // 1-10 repeating + // Expected: proper subtraction with negative results + expected[i] = static_cast(static_cast(input_a[i]) - static_cast(input_b[i])); + } + + infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {size}, input_a.data())); + infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {size}, input_b.data())); + infer_request.infer(); + + auto out = infer_request.get_output_tensor(0); + auto* out_data = out.data(); + + for (size_t i = 0; i < expected.size(); ++i) { + EXPECT_FLOAT_EQ(out_data[i], expected[i]) + << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]; + } +} + } // namespace } // namespace test } // namespace ov From debffa02e82b8ed3cd696618fe670a6b97551e79 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Sat, 17 Jan 2026 17:23:00 +0530 Subject: [PATCH 4/8] [CPU] Address reviewer feedback for u8 eltwise wrap-around fix - Replace subtract_u8_wrap.cpp with proper 
eltwise_overflow test class - Test both UNDERFLOW (subtract) and OVERFLOW (add) using CompareWithRefs - Use all_of() utility instead of chained && comparisons - Use OPENVINO_ASSERT instead of if-check for node null - Remove issue ticket references from comments --- .../plugin/x64/jit_eltwise_emitters.cpp | 17 +- .../src/nodes/executors/acl/acl_eltwise.cpp | 12 +- .../classes/eltwise_overflow.cpp | 85 ++++++ .../classes/eltwise_overflow.hpp | 32 ++ .../instances/common/eltwise_overflow.cpp | 24 ++ .../instances/common/subtract_u8_wrap.cpp | 284 ------------------ 6 files changed, 154 insertions(+), 300 deletions(-) create mode 100644 src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp create mode 100644 src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp delete mode 100644 src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index bd06a3c2017920..a8349c8497729c 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -246,7 +246,6 @@ void jit_subtract_emitter::emit_isa(const std::vector& in_vec_idxs, case ov::element::u8: // u8 subtraction uses vpsubb which naturally wraps around (mod 256). // This gives correct behavior: e.g., 3 - 4 = 255. 
- // See https://github.com/openvinotoolkit/openvino/issues/33164 OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output"); h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1); break; @@ -267,16 +266,14 @@ std::set> jit_subtract_emitter::get_supported_precisi const std::shared_ptr& node) { std::set> supported = {{element::f32, element::f32}, {element::i32, element::i32}}; - // Only enable u8 wrap-around for pure u8->u8 arithmetic (issue #33164). + // Only enable u8 wrap-around for pure u8->u8 arithmetic. // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. - if (node) { - const auto in0 = node->get_input_element_type(0); - const auto in1 = node->get_input_element_type(1); - const auto out = node->get_output_element_type(0); - - if (in0 == element::u8 && in1 == element::u8 && out == element::u8) { - supported.insert({element::u8, element::u8}); - } + OPENVINO_ASSERT(node, "node must not be null for get_supported_precisions"); + if (ov::intel_cpu::all_of(element::u8, + node->get_input_element_type(0), + node->get_input_element_type(1), + node->get_output_element_type(0))) { + supported.insert({element::u8, element::u8}); } return supported; diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index d0f351976a5463..375692d019fe37 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -36,6 +36,7 @@ #include "openvino/core/except.hpp" #include "openvino/core/type/element_type.hpp" #include "utils/debug_capabilities.h" +#include "utils/general_utils.h" namespace ov::intel_cpu { @@ -299,13 +300,12 @@ bool AclEltwiseExecutor::init(const std::vector& srcDescs, const break; case Algorithm::EltwiseSubtract: { // For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0. 
- // Only use wrap-around for pure u8->u8 subtract (issue #33164). + // Only use wrap-around for pure u8->u8 subtract. // QDQ patterns with u8 input but f32/i32 output must saturate. - // See https://github.com/openvinotoolkit/openvino/issues/33164 - - const bool is_u8_u8_to_u8 = (srcDescs[0]->getPrecision() == ov::element::u8) && - (srcDescs[1]->getPrecision() == ov::element::u8) && - (dstDescs[0]->getPrecision() == ov::element::u8); + const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8, + srcDescs[0]->getPrecision(), + srcDescs[1]->getPrecision(), + dstDescs[0]->getPrecision()); const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE; diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp new file mode 100644 index 00000000000000..0e8512b1f97627 --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp @@ -0,0 +1,85 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "eltwise_overflow.hpp" + +#include "common_test_utils/ov_tensor_utils.hpp" +#include "openvino/op/add.hpp" +#include "openvino/op/subtract.hpp" + +namespace ov { +namespace test { + +std::string EltwiseOverflowLayerCPUTest::getTestCaseName(const testing::TestParamInfo& obj) { + const auto& [kind, shape] = obj.param; + std::ostringstream result; + result << "kind=" << (kind == EltwiseOverflowKind::UNDERFLOW ? 
"UNDERFLOW" : "OVERFLOW"); + result << "_shape=" << shape; + return result.str(); +} + +void EltwiseOverflowLayerCPUTest::SetUp() { + targetDevice = ov::test::utils::DEVICE_CPU; + abs_threshold = 0; + + const auto& [kind, shape] = GetParam(); + overflowKind = kind; + + InputShape inShape = {{}, {shape}}; + init_input_shapes({inShape, inShape}); + + auto a = std::make_shared(ov::element::u8, ov::PartialShape(shape)); + auto b = std::make_shared(ov::element::u8, ov::PartialShape(shape)); + + std::shared_ptr op; + if (kind == EltwiseOverflowKind::UNDERFLOW) { + op = std::make_shared(a, b); + } else { + op = std::make_shared(a, b); + } + + function = makeNgraphFunction(ov::element::u8, {a, b}, op, "EltwiseOverflow"); +} + +void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { + inputs.clear(); + const auto& modelInputs = function->inputs(); + ASSERT_EQ(modelInputs.size(), 2u); + ASSERT_EQ(targetInputStaticShapes.size(), 2u); + + const size_t size = ov::shape_size(targetInputStaticShapes[0]); + + std::vector data0(size); + std::vector data1(size); + + if (overflowKind == EltwiseOverflowKind::UNDERFLOW) { + // u8 subtract underflow: should wrap, not saturate. + // E.g., 3 - 4 = 255 (not 0) + for (size_t i = 0; i < size; ++i) { + data0[i] = static_cast(i % 10); // 0-9 repeating + data1[i] = static_cast((i % 10) + 1); // 1-10 repeating + } + } else { + // u8 add overflow: should wrap (255 + 1 = 0). 
+ for (size_t i = 0; i < size; ++i) { + data0[i] = static_cast(250 + (i % 6)); // 250-255 repeating + data1[i] = static_cast((i % 10) + 1); // 1-10 repeating + } + } + + auto t0 = ov::Tensor(ov::element::u8, targetInputStaticShapes[0]); + auto t1 = ov::Tensor(ov::element::u8, targetInputStaticShapes[1]); + std::copy(data0.begin(), data0.end(), t0.data()); + std::copy(data1.begin(), data1.end(), t1.data()); + + inputs.insert({modelInputs[0].get_node_shared_ptr(), t0}); + inputs.insert({modelInputs[1].get_node_shared_ptr(), t1}); +} + +TEST_P(EltwiseOverflowLayerCPUTest, CompareWithRefs) { + run(); +} + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp new file mode 100644 index 00000000000000..a01209939d95ee --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp @@ -0,0 +1,32 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "shared_test_classes/base/ov_subgraph.hpp" + +namespace ov { +namespace test { + +enum class EltwiseOverflowKind { UNDERFLOW, OVERFLOW }; + +typedef std::tuple EltwiseOverflowTestParams; + +class EltwiseOverflowLayerCPUTest : public testing::WithParamInterface, + virtual public SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj); + +protected: + void SetUp() override; + void generate_inputs(const std::vector& targetInputStaticShapes) override; + +private: + EltwiseOverflowKind overflowKind; +}; + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp new file mode 100644 
index 00000000000000..d49ba76acf845b --- /dev/null +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp @@ -0,0 +1,24 @@ +// Copyright (C) 2025 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "custom/single_layer_tests/classes/eltwise_overflow.hpp" + +namespace ov { +namespace test { + +const std::vector overflowKinds = {EltwiseOverflowKind::UNDERFLOW, EltwiseOverflowKind::OVERFLOW}; + +const std::vector testShapes = { + {4}, // small 1D + {64}, // larger 1D to exercise vectorized JIT path + {1, 2, 2, 2}, // 4D typical NN shape +}; + +INSTANTIATE_TEST_SUITE_P(smoke_EltwiseOverflowU8, + EltwiseOverflowLayerCPUTest, + ::testing::Combine(::testing::ValuesIn(overflowKinds), ::testing::ValuesIn(testShapes)), + EltwiseOverflowLayerCPUTest::getTestCaseName); + +} // namespace test +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp deleted file mode 100644 index ea6948179e8ee7..00000000000000 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (C) 2025 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -// Regression test for GitHub issue #33164: -// u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0. -// https://github.com/openvinotoolkit/openvino/issues/33164 -// -// Additionally, tests ensure that TypeRelaxed subtract with u8 inputs but -// f32/i32 output does NOT use wrap-around (must give negative values). -// This catches regressions in LPT/dequantization patterns. 
- -#include - -#include "openvino/op/parameter.hpp" -#include "openvino/op/result.hpp" -#include "openvino/op/subtract.hpp" -#include "openvino/openvino.hpp" -#include "ov_ops/type_relaxed.hpp" - -namespace ov { -namespace test { -namespace { - -class SubtractU8WrapAroundTest : public ::testing::Test { -protected: - void SetUp() override { - core = std::make_shared(); - } - - std::shared_ptr core; -}; - -// Test that u8 subtraction wraps around instead of saturating. -// This is a regression test for https://github.com/openvinotoolkit/openvino/issues/33164 -TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior) { - // Create a simple model: out = a - b (both u8) - auto a = std::make_shared(ov::element::u8, ov::Shape{4}); - auto b = std::make_shared(ov::element::u8, ov::Shape{4}); - auto subtract = std::make_shared(a, b); - auto result = std::make_shared(subtract); - auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - // Compile for CPU - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - // Test cases that exercise underflow: - // Input A: [3, 0, 1, 5] - // Input B: [4, 1, 2, 3] - // Expected with wrap-around (mod 256): [255, 255, 255, 2] - // Wrong saturation result would be: [0, 0, 0, 2] - std::vector input_a = {3, 0, 1, 5}; - std::vector input_b = {4, 1, 2, 3}; - std::vector expected = {255, 255, 255, 2}; - - auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); - auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); - - infer_request.set_tensor(a, tensor_a); - infer_request.set_tensor(b, tensor_b); - infer_request.infer(); - - auto output = infer_request.get_output_tensor(0); - auto output_data = output.data(); - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(output_data[i], expected[i]) - << "Mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " - << static_cast(expected[i]) << ". 
u8 subtraction should wrap around (mod 256), not saturate to 0."; - } -} - -// Test with larger tensor to exercise vector path in JIT -TEST_F(SubtractU8WrapAroundTest, WrapAroundBehaviorLargeVector) { - const size_t size = 64; // Large enough to trigger vectorized JIT path - - auto a = std::make_shared(ov::element::u8, ov::Shape{size}); - auto b = std::make_shared(ov::element::u8, ov::Shape{size}); - auto subtract = std::make_shared(a, b); - auto result = std::make_shared(subtract); - auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - std::vector input_a(size); - std::vector input_b(size); - std::vector expected(size); - - for (size_t i = 0; i < size; ++i) { - input_a[i] = static_cast(i % 10); // 0-9 repeating - input_b[i] = static_cast((i % 10) + 1); // 1-10 repeating - // Each result should be -1 mod 256 = 255, except when a >= b - expected[i] = static_cast((256 + input_a[i] - input_b[i]) % 256); - } - - auto tensor_a = ov::Tensor(ov::element::u8, {size}, input_a.data()); - auto tensor_b = ov::Tensor(ov::element::u8, {size}, input_b.data()); - - infer_request.set_tensor(a, tensor_a); - infer_request.set_tensor(b, tensor_b); - infer_request.infer(); - - auto output = infer_request.get_output_tensor(0); - auto output_data = output.data(); - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(output_data[i], expected[i]) - << "Mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " - << static_cast(expected[i]); - } -} - -// Test with 4D tensor to match typical NN tensor shapes -TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) { - auto a = std::make_shared(ov::element::u8, ov::Shape{1, 2, 2, 2}); - auto b = std::make_shared(ov::element::u8, ov::Shape{1, 2, 2, 2}); - auto subtract = std::make_shared(a, b); - auto result = std::make_shared(subtract); - auto model = 
std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - // All zeros minus all ones should give all 255s (wrap-around) - std::vector input_a(8, 0); - std::vector input_b(8, 1); - std::vector expected(8, 255); - - auto tensor_a = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_a.data()); - auto tensor_b = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_b.data()); - - infer_request.set_tensor(a, tensor_a); - infer_request.set_tensor(b, tensor_b); - infer_request.infer(); - - auto output = infer_request.get_output_tensor(0); - auto output_data = output.data(); - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(output_data[i], expected[i]) - << "4D tensor mismatch at index " << i << ": got " << static_cast(output_data[i]) << ", expected " - << static_cast(expected[i]); - } -} - -// ============================================================================ -// TypeRelaxed tests: u8 inputs with non-u8 output (dequantization patterns) -// These tests ensure that the u8 wrap-around path is NOT used when output -// type is f32 or i32 (typical in LPT/QDQ patterns). -// ============================================================================ - -// u8 inputs, but output overridden to f32: MUST NOT wrap, should give negatives. -// This test would have caught the CI failure "unsupported src_prc: u8" crash. 
-TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_NoWrap_NoCrash) { - auto a = std::make_shared(ov::element::u8, ov::Shape{4}); - auto b = std::make_shared(ov::element::u8, ov::Shape{4}); - - // Create a TypeRelaxed subtract: u8 inputs, f32 output - // This simulates dequantization subtract patterns in LPT - using TRSub = ov::op::TypeRelaxed; - auto sub = std::make_shared( - ov::element::TypeVector{ov::element::f32, ov::element::f32}, // origin input types for inference - ov::element::TypeVector{ov::element::f32}, // overridden output type - a, - b); - - auto result = std::make_shared(sub); - auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - std::vector input_a = {3, 0, 1, 5}; - std::vector input_b = {4, 1, 2, 3}; - - auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); - auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); - - infer_request.set_tensor(a, tensor_a); - infer_request.set_tensor(b, tensor_b); - infer_request.infer(); - - auto out = infer_request.get_output_tensor(0); - ASSERT_EQ(out.get_element_type(), ov::element::f32); - - auto* out_data = out.data(); - // With proper dequantization semantics, these should be NEGATIVE values - // NOT wrap-around values like 255 - std::vector expected = {-1.f, -1.f, -1.f, 2.f}; - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_FLOAT_EQ(out_data[i], expected[i]) - << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] - << ". TypeRelaxed u8->f32 subtract must NOT use wrap-around."; - } -} - -// Same idea, but output overridden to i32. 
-TEST_F(SubtractU8WrapAroundTest, U8Inputs_I32Output_NoWrap_NoCrash) { - auto a = std::make_shared(ov::element::u8, ov::Shape{4}); - auto b = std::make_shared(ov::element::u8, ov::Shape{4}); - - // Create a TypeRelaxed subtract: u8 inputs, i32 output - using TRSub = ov::op::TypeRelaxed; - auto sub = std::make_shared( - ov::element::TypeVector{ov::element::i32, ov::element::i32}, // origin input types for inference - ov::element::TypeVector{ov::element::i32}, // overridden output type - a, - b); - - auto result = std::make_shared(sub); - auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - std::vector input_a = {3, 0, 1, 5}; - std::vector input_b = {4, 1, 2, 3}; - - infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {4}, input_a.data())); - infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {4}, input_b.data())); - infer_request.infer(); - - auto out = infer_request.get_output_tensor(0); - ASSERT_EQ(out.get_element_type(), ov::element::i32); - - auto* out_data = out.data(); - // With proper dequantization semantics, these should be NEGATIVE values - std::vector expected = {-1, -1, -1, 2}; - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_EQ(out_data[i], expected[i]) << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] - << ". 
TypeRelaxed u8->i32 subtract must NOT use wrap-around."; - } -} - -// Test with larger vector to exercise JIT vectorized path for TypeRelaxed -TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_LargeVector) { - const size_t size = 64; // Large enough to trigger vectorized JIT path - - auto a = std::make_shared(ov::element::u8, ov::Shape{size}); - auto b = std::make_shared(ov::element::u8, ov::Shape{size}); - - using TRSub = ov::op::TypeRelaxed; - auto sub = std::make_shared(ov::element::TypeVector{ov::element::f32, ov::element::f32}, - ov::element::TypeVector{ov::element::f32}, - a, - b); - - auto result = std::make_shared(sub); - auto model = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}); - - auto compiled_model = core->compile_model(model, "CPU"); - auto infer_request = compiled_model.create_infer_request(); - - std::vector input_a(size); - std::vector input_b(size); - std::vector expected(size); - - for (size_t i = 0; i < size; ++i) { - input_a[i] = static_cast(i % 10); // 0-9 repeating - input_b[i] = static_cast((i % 10) + 1); // 1-10 repeating - // Expected: proper subtraction with negative results - expected[i] = static_cast(static_cast(input_a[i]) - static_cast(input_b[i])); - } - - infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {size}, input_a.data())); - infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {size}, input_b.data())); - infer_request.infer(); - - auto out = infer_request.get_output_tensor(0); - auto* out_data = out.data(); - - for (size_t i = 0; i < expected.size(); ++i) { - EXPECT_FLOAT_EQ(out_data[i], expected[i]) - << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]; - } -} - -} // namespace -} // namespace test -} // namespace ov From b5561fae96cb1c12644b60942a4f35efedd7a92f Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Wed, 4 Feb 2026 19:09:06 -0500 Subject: [PATCH 5/8] [CPU] Add u8 wrap-around support for Add operation and address review comments - Added u8 wrap-around 
support for jit_add_emitter (x64 JIT) - Added ConvertPolicy::WRAP for EltwiseAdd in ACL executor (ARM) - Changed test inputs to hardcoded values that guarantee overflow/underflow - Fixed test to use ov::Model directly instead of makeNgraphFunction --- .../plugin/x64/jit_eltwise_emitters.cpp | 23 +++++++++++-- .../src/nodes/executors/acl/acl_eltwise.cpp | 18 +++++++--- .../classes/eltwise_overflow.cpp | 33 +++++++++++-------- 3 files changed, 53 insertions(+), 21 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index a8349c8497729c..6f487e4d081251 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -79,6 +79,12 @@ void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std case ov::element::i32: h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1); break; + case ov::element::u8: + // u8 addition uses vpaddb which naturally wraps around (mod 256). + // This gives correct behavior: e.g., 255 + 1 = 0. + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 add JIT must only be used for u8 output"); + h->uni_vpaddb(vmm_dst, vmm_src0, vmm_src1); + break; default: OV_CPU_JIT_EMITTER_THROW("Unsupported precision"); } @@ -92,9 +98,20 @@ void jit_add_emitter::emit_isa(const std::vector& in_vec_idxs, const std } } -std::set> jit_add_emitter::get_supported_precisions( - [[maybe_unused]] const std::shared_ptr& node) { - return {{element::f32, element::f32}, {element::i32, element::i32}}; +std::set> jit_add_emitter::get_supported_precisions(const std::shared_ptr& node) { + std::set> supported = {{element::f32, element::f32}, {element::i32, element::i32}}; + + // Only enable u8 wrap-around for pure u8->u8 arithmetic. + // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. 
+ OPENVINO_ASSERT(node, "node must not be null for get_supported_precisions"); + if (ov::intel_cpu::all_of(element::u8, + node->get_input_element_type(0), + node->get_input_element_type(1), + node->get_output_element_type(0))) { + supported.insert({element::u8, element::u8}); + } + + return supported; } /// MUL_ADD /// diff --git a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp index 375692d019fe37..2bfe602c4f96e1 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp @@ -265,19 +265,29 @@ bool AclEltwiseExecutor::init(const std::vector& srcDescs, const std::function(void)> exec_func; switch (aclEltwiseAttrs.data.algo) { - case Algorithm::EltwiseAdd: + case Algorithm::EltwiseAdd: { + // For u8, Add must wrap on overflow (e.g. 255 + 1 = 0), not saturate. + // Only use wrap-around for pure u8->u8 add. + // QDQ patterns with u8 input but f32/i32 output must saturate. + const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8, + srcDescs[0]->getPrecision(), + srcDescs[1]->getPrecision(), + dstDescs[0]->getPrecision()); + const auto convert_policy = is_u8_u8_to_u8 ? 
ConvertPolicy::WRAP : ConvertPolicy::SATURATE; + if (!NEArithmeticAddition::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data(), - ConvertPolicy::SATURATE)) { + convert_policy)) { return false; } - exec_func = [this]() -> std::unique_ptr { + exec_func = [this, convert_policy]() -> std::unique_ptr { auto acl_op = std::make_unique(); - acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE); + acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy); return acl_op; }; break; + } case Algorithm::EltwiseMultiply: if (!NEPixelWiseMultiplication::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp index 0e8512b1f97627..865642312e0648 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp @@ -6,6 +6,7 @@ #include "common_test_utils/ov_tensor_utils.hpp" #include "openvino/op/add.hpp" +#include "openvino/op/result.hpp" #include "openvino/op/subtract.hpp" namespace ov { @@ -39,7 +40,8 @@ void EltwiseOverflowLayerCPUTest::SetUp() { op = std::make_shared(a, b); } - function = makeNgraphFunction(ov::element::u8, {a, b}, op, "EltwiseOverflow"); + auto result = std::make_shared(op); + function = std::make_shared(ov::ResultVector{result}, ov::ParameterVector{a, b}, "EltwiseOverflow"); } void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { @@ -50,22 +52,25 @@ void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector& const size_t size = ov::shape_size(targetInputStaticShapes[0]); + // Hardcoded values that guarantee underflow/overflow regardless of shape size. 
+ // Pattern repeats to fill any shape. + static const std::vector underflow_a = {3, 0, 1, 5, 10, 0, 100, 50}; + static const std::vector underflow_b = {4, 1, 2, 3, 20, 1, 200, 51}; + // Expected results (wrap): 255, 255, 255, 2, 246, 255, 156, 255 + + static const std::vector overflow_a = {255, 254, 200, 128, 255, 250, 255, 1}; + static const std::vector overflow_b = {1, 2, 100, 128, 255, 10, 128, 255}; + // Expected results (wrap): 0, 0, 44, 0, 254, 4, 127, 0 + + const auto& src_a = (overflowKind == EltwiseOverflowKind::UNDERFLOW) ? underflow_a : overflow_a; + const auto& src_b = (overflowKind == EltwiseOverflowKind::UNDERFLOW) ? underflow_b : overflow_b; + std::vector data0(size); std::vector data1(size); - if (overflowKind == EltwiseOverflowKind::UNDERFLOW) { - // u8 subtract underflow: should wrap, not saturate. - // E.g., 3 - 4 = 255 (not 0) - for (size_t i = 0; i < size; ++i) { - data0[i] = static_cast(i % 10); // 0-9 repeating - data1[i] = static_cast((i % 10) + 1); // 1-10 repeating - } - } else { - // u8 add overflow: should wrap (255 + 1 = 0). 
- for (size_t i = 0; i < size; ++i) { - data0[i] = static_cast(250 + (i % 6)); // 250-255 repeating - data1[i] = static_cast((i % 10) + 1); // 1-10 repeating - } + for (size_t i = 0; i < size; ++i) { + data0[i] = src_a[i % src_a.size()]; + data1[i] = src_b[i % src_b.size()]; } auto t0 = ov::Tensor(ov::element::u8, targetInputStaticShapes[0]); From 1badae9978e1b2c37fdcda21aebf94f285f3dda2 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Tue, 17 Feb 2026 20:18:01 -0500 Subject: [PATCH 6/8] Fix get_supported_precisions crash: use if-guard instead of OPENVINO_ASSERT --- .../src/emitters/plugin/x64/jit_eltwise_emitters.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index 6f487e4d081251..e55d196961388e 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -103,8 +103,9 @@ std::set> jit_add_emitter::get_supported_precisions(c // Only enable u8 wrap-around for pure u8->u8 arithmetic. // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. - OPENVINO_ASSERT(node, "node must not be null for get_supported_precisions"); - if (ov::intel_cpu::all_of(element::u8, + // node may be nullptr when called from SupportedPrecisions functor (general query). + if (node && + ov::intel_cpu::all_of(element::u8, node->get_input_element_type(0), node->get_input_element_type(1), node->get_output_element_type(0))) { @@ -285,8 +286,9 @@ std::set> jit_subtract_emitter::get_supported_precisi // Only enable u8 wrap-around for pure u8->u8 arithmetic. // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
- OPENVINO_ASSERT(node, "node must not be null for get_supported_precisions"); - if (ov::intel_cpu::all_of(element::u8, + // node may be nullptr when called from SupportedPrecisions functor (general query). + if (node && + ov::intel_cpu::all_of(element::u8, node->get_input_element_type(0), node->get_input_element_type(1), node->get_output_element_type(0))) { From 6f3325507104e2a2cacd7e8f2fc4e70762149f19 Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Tue, 17 Feb 2026 20:39:20 -0500 Subject: [PATCH 7/8] Clang-18 fix --- .../plugin/x64/jit_eltwise_emitters.cpp | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp index e55d196961388e..c9979b644e24e1 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/x64/jit_eltwise_emitters.cpp @@ -104,11 +104,10 @@ std::set> jit_add_emitter::get_supported_precisions(c // Only enable u8 wrap-around for pure u8->u8 arithmetic. // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. // node may be nullptr when called from SupportedPrecisions functor (general query). - if (node && - ov::intel_cpu::all_of(element::u8, - node->get_input_element_type(0), - node->get_input_element_type(1), - node->get_output_element_type(0))) { + if (node && ov::intel_cpu::all_of(element::u8, + node->get_input_element_type(0), + node->get_input_element_type(1), + node->get_output_element_type(0))) { supported.insert({element::u8, element::u8}); } @@ -287,11 +286,10 @@ std::set> jit_subtract_emitter::get_supported_precisi // Only enable u8 wrap-around for pure u8->u8 arithmetic. // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution. // node may be nullptr when called from SupportedPrecisions functor (general query). 
- if (node && - ov::intel_cpu::all_of(element::u8, - node->get_input_element_type(0), - node->get_input_element_type(1), - node->get_output_element_type(0))) { + if (node && ov::intel_cpu::all_of(element::u8, + node->get_input_element_type(0), + node->get_input_element_type(1), + node->get_output_element_type(0))) { supported.insert({element::u8, element::u8}); } From d03bed17124f44cb4dcc85b5987fa4b74c6e3e0b Mon Sep 17 00:00:00 2001 From: Nishant-ZFYII Date: Wed, 25 Feb 2026 19:28:43 -0500 Subject: [PATCH 8/8] Update copyright headers to 2026 --- .../custom/single_layer_tests/classes/eltwise_overflow.cpp | 2 +- .../custom/single_layer_tests/classes/eltwise_overflow.hpp | 2 +- .../single_layer_tests/instances/common/eltwise_overflow.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp index 865642312e0648..e747d64d56df09 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2025 Intel Corporation +// Copyright (C) 2018-2026 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp index a01209939d95ee..6d65f2952e0312 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise_overflow.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2025 Intel Corporation +// Copyright (C) 2018-2026 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // diff --git 
a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp index d49ba76acf845b..45c3633e2dcaaa 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/instances/common/eltwise_overflow.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2025 Intel Corporation +// Copyright (C) 2018-2026 Intel Corporation // SPDX-License-Identifier: Apache-2.0 //