@@ -243,6 +243,13 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
case ov::element::i32:
h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 subtraction uses vpsubb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 3 - 4 = 255.
// See https://github.com/openvinotoolkit/openvino/issues/33164
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output");
h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
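
For reference, a minimal standalone sketch (not taken from the PR) of the wrap-around semantics the vpsubb path relies on: ordinary unsigned 8-bit arithmetic in C++ wraps mod 256 in exactly the same way.

#include <cassert>
#include <cstdint>

// Scalar model of one u8 lane of uni_vpsubb: integer promotion computes
// a - b exactly, and the cast back to uint8_t truncates mod 256.
static uint8_t sub_u8_wrap(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a - b);
}

int main() {
    assert(sub_u8_wrap(3, 4) == 255);  // underflow wraps, matching the issue's example
    assert(sub_u8_wrap(5, 3) == 2);    // no underflow, ordinary result
    return 0;
}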
@@ -257,8 +264,22 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
}

std::set<std::vector<element::Type>> jit_subtract_emitter::get_supported_precisions(
- [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
- return {{element::f32, element::f32}, {element::i32, element::i32}};
+ const std::shared_ptr<ov::Node>& node) {
std::set<std::vector<element::Type>> supported = {{element::f32, element::f32}, {element::i32, element::i32}};

// Only enable u8 wrap-around for pure u8->u8 arithmetic (issue #33164).
// QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
if (node) {
const auto in0 = node->get_input_element_type(0);
const auto in1 = node->get_input_element_type(1);
const auto out = node->get_output_element_type(0);

if (in0 == element::u8 && in1 == element::u8 && out == element::u8) {
supported.insert({element::u8, element::u8});
}
}

return supported;
}

/// MULTIPLY ///
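The gate above matters because wrapped u8 subtraction and widened subtraction disagree exactly when the true result is negative. A small illustrative sketch (not part of the PR) of the two semantics being kept apart:

#include <cassert>
#include <cstdint>

int main() {
    const uint8_t a = 3, b = 4;
    // Pure u8 -> u8 execution: the result is reduced mod 256.
    const uint8_t wrapped = static_cast<uint8_t>(a - b);                  // 255
    // Dequantization-style execution (u8 inputs, wider output): sign is kept.
    const float widened = static_cast<float>(a) - static_cast<float>(b);  // -1.0f
    assert(wrapped == 255);
    assert(widened == -1.0f);
    return 0;
}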
20 changes: 16 additions & 4 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -297,19 +297,31 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
return acl_op;
};
break;
- case Algorithm::EltwiseSubtract:
+ case Algorithm::EltwiseSubtract: {
// For u8, Subtract must wrap (e.g., 3 - 4 = 255), not saturate to 0.
// Use wrap-around only for pure u8->u8 subtract; QDQ patterns with u8
// inputs but f32/i32 output must keep ConvertPolicy::SATURATE.
// See https://github.com/openvinotoolkit/openvino/issues/33164

const bool is_u8_u8_to_u8 = (srcDescs[0]->getPrecision() == ov::element::u8) &&
(srcDescs[1]->getPrecision() == ov::element::u8) &&
(dstDescs[0]->getPrecision() == ov::element::u8);

const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
- ConvertPolicy::SATURATE)) {
+ convert_policy)) {
return false;
}
- exec_func = [this]() -> std::unique_ptr<IFunction> {
+ exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticSubtraction>();
- acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
+ acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseDivide:
if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) {
return false;
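For the ACL path, the behavioral difference comes down to Arm Compute Library's ConvertPolicy. A scalar sketch (illustrative only; it models the arithmetic, not ACL internals) of what the two policies mean for a u8 destination:

#include <algorithm>
#include <cassert>
#include <cstdint>

// SATURATE clamps the exact result into the u8 range [0, 255].
static uint8_t sub_u8_saturate(uint8_t a, uint8_t b) {
    const int exact = static_cast<int>(a) - static_cast<int>(b);
    return static_cast<uint8_t>(std::clamp(exact, 0, 255));
}

// WRAP keeps the result mod 256.
static uint8_t sub_u8_wrap(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a - b);
}

int main() {
    assert(sub_u8_saturate(3, 4) == 0);  // previous behavior: underflow clamps to 0
    assert(sub_u8_wrap(3, 4) == 255);    // u8->u8 behavior required by issue #33164
    return 0;
}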
@@ -0,0 +1,284 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

// Regression test for GitHub issue #33164:
// u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0.
// https://github.com/openvinotoolkit/openvino/issues/33164
//
// Additionally, the tests ensure that a TypeRelaxed Subtract with u8 inputs but
// f32/i32 output does NOT use wrap-around (it must produce negative values).
// This catches regressions in LPT/dequantization patterns.

#include <gtest/gtest.h>

#include "openvino/op/parameter.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/openvino.hpp"
#include "ov_ops/type_relaxed.hpp"

namespace ov {
namespace test {
namespace {

class SubtractU8WrapAroundTest : public ::testing::Test {
protected:
void SetUp() override {
core = std::make_shared<ov::Core>();
}

std::shared_ptr<ov::Core> core;
};

// Test that u8 subtraction wraps around instead of saturating.
// This is a regression test for https://github.com/openvinotoolkit/openvino/issues/33164
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior) {
// Create a simple model: out = a - b (both u8)
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

// Compile for CPU
auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

// Test cases that exercise underflow:
// Input A: [3, 0, 1, 5]
// Input B: [4, 1, 2, 3]
// Expected with wrap-around (mod 256): [255, 255, 255, 2]
// Wrong saturation result would be: [0, 0, 0, 2]
std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};
std::vector<uint8_t> expected = {255, 255, 255, 2};

auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]) << ". u8 subtraction should wrap around (mod 256), not saturate to 0.";
}
}

// Test with larger tensor to exercise vector path in JIT
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehaviorLargeVector) {
const size_t size = 64; // Large enough to trigger vectorized JIT path

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a(size);
std::vector<uint8_t> input_b(size);
std::vector<uint8_t> expected(size);

for (size_t i = 0; i < size; ++i) {
input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating
input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating
// a - b is always -1 here, so every expected value wraps to 255 (mod 256)
expected[i] = static_cast<uint8_t>((256 + input_a[i] - input_b[i]) % 256);
}

auto tensor_a = ov::Tensor(ov::element::u8, {size}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {size}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]);
}
}

// Test with 4D tensor to match typical NN tensor shapes
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

// All zeros minus all ones should give all 255s (wrap-around)
std::vector<uint8_t> input_a(8, 0);
std::vector<uint8_t> input_b(8, 1);
std::vector<uint8_t> expected(8, 255);

auto tensor_a = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "4D tensor mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]);
}
}

// ============================================================================
// TypeRelaxed tests: u8 inputs with non-u8 output (dequantization patterns)
// These tests ensure that the u8 wrap-around path is NOT used when output
// type is f32 or i32 (typical in LPT/QDQ patterns).
// ============================================================================
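
// For context, a rough sketch of how such mixed-type Subtracts arise (the exact
// LPT-produced graph may differ; this shape is an assumption for illustration):
//
//   u8 tensor --Convert(f32)--> Subtract(f32 zero_point) --> Multiply(f32 scale)
//
// LPT can fold the Convert into the Subtract, leaving a TypeRelaxed op with u8
// inputs that must still compute in f32, e.g. 3 - 4 == -1.0f, never 255.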

// u8 inputs, but output overridden to f32: MUST NOT wrap, should give negatives.
// This test would have caught the "unsupported src_prc: u8" crash seen in CI.
TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_NoWrap_NoCrash) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});

// Create a TypeRelaxed subtract: u8 inputs, f32 output
// This simulates dequantization subtract patterns in LPT
using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(
ov::element::TypeVector{ov::element::f32, ov::element::f32}, // origin input types for inference
ov::element::TypeVector{ov::element::f32}, // overridden output type
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};

auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
ASSERT_EQ(out.get_element_type(), ov::element::f32);

auto* out_data = out.data<float>();
// With proper dequantization semantics, these should be NEGATIVE values
// NOT wrap-around values like 255
std::vector<float> expected = {-1.f, -1.f, -1.f, 2.f};

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_FLOAT_EQ(out_data[i], expected[i])
<< "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]
<< ". TypeRelaxed u8->f32 subtract must NOT use wrap-around.";
}
}

// Same idea, but output overridden to i32.
TEST_F(SubtractU8WrapAroundTest, U8Inputs_I32Output_NoWrap_NoCrash) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});

// Create a TypeRelaxed subtract: u8 inputs, i32 output
using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(
ov::element::TypeVector{ov::element::i32, ov::element::i32}, // origin input types for inference
ov::element::TypeVector{ov::element::i32}, // overridden output type
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};

infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {4}, input_a.data()));
infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {4}, input_b.data()));
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
ASSERT_EQ(out.get_element_type(), ov::element::i32);

auto* out_data = out.data<int32_t>();
// With proper dequantization semantics, these should be NEGATIVE values
std::vector<int32_t> expected = {-1, -1, -1, 2};

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(out_data[i], expected[i]) << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]
<< ". TypeRelaxed u8->i32 subtract must NOT use wrap-around.";
}
}

// Test with larger vector to exercise JIT vectorized path for TypeRelaxed
TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_LargeVector) {
const size_t size = 64; // Large enough to trigger vectorized JIT path

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});

using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(ov::element::TypeVector{ov::element::f32, ov::element::f32},
ov::element::TypeVector{ov::element::f32},
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a(size);
std::vector<uint8_t> input_b(size);
std::vector<float> expected(size);

for (size_t i = 0; i < size; ++i) {
input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating
input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating
// Expected: proper subtraction with negative results
expected[i] = static_cast<float>(static_cast<int>(input_a[i]) - static_cast<int>(input_b[i]));
}

infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {size}, input_a.data()));
infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {size}, input_b.data()));
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
auto* out_data = out.data<float>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_FLOAT_EQ(out_data[i], expected[i])
<< "index=" << i << ": got " << out_data[i] << ", expected " << expected[i];
}
}

} // namespace
} // namespace test
} // namespace ov