@@ -243,6 +243,13 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
case ov::element::i32:
h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 subtraction uses vpsubb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 3 - 4 = 255.
// See https://github.com/openvinotoolkit/openvino/issues/33164
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output");
h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
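
For reference, a minimal standalone sketch (not taken from the PR) of the wrap-around semantics the vpsubb path relies on: ordinary unsigned 8-bit arithmetic in C++ wraps mod 256 in exactly the same way.

#include <cassert>
#include <cstdint>

// Scalar model of one u8 lane of uni_vpsubb: integer promotion computes
// a - b exactly, and the cast back to uint8_t truncates mod 256.
static uint8_t sub_u8_wrap(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a - b);
}

int main() {
    assert(sub_u8_wrap(3, 4) == 255);  // underflow wraps, matching the issue's example
    assert(sub_u8_wrap(5, 3) == 2);    // no underflow, ordinary result
    return 0;
}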
@@ -257,8 +264,22 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
}

std::set<std::vector<element::Type>> jit_subtract_emitter::get_supported_precisions(
- [[maybe_unused]] const std::shared_ptr<ov::Node>& node) {
- return {{element::f32, element::f32}, {element::i32, element::i32}};
+ const std::shared_ptr<ov::Node>& node) {
std::set<std::vector<element::Type>> supported = {{element::f32, element::f32}, {element::i32, element::i32}};

// Only enable u8 wrap-around for pure u8->u8 arithmetic (issue #33164).
// QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
if (node) {
const auto in0 = node->get_input_element_type(0);
const auto in1 = node->get_input_element_type(1);
const auto out = node->get_output_element_type(0);

if (in0 == element::u8 && in1 == element::u8 && out == element::u8) {
supported.insert({element::u8, element::u8});
}
}

return supported;
}

/// MULTIPLY ///
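The gate above matters because wrapped u8 subtraction and widened subtraction disagree exactly when the true result is negative. A small illustrative sketch (not part of the PR) of the two semantics being kept apart:

#include <cassert>
#include <cstdint>

int main() {
    const uint8_t a = 3, b = 4;
    // Pure u8 -> u8 execution: the result is reduced mod 256.
    const uint8_t wrapped = static_cast<uint8_t>(a - b);                  // 255
    // Dequantization-style execution (u8 inputs, wider output): sign is kept.
    const float widened = static_cast<float>(a) - static_cast<float>(b);  // -1.0f
    assert(wrapped == 255);
    assert(widened == -1.0f);
    return 0;
}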
20 changes: 16 additions & 4 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -297,19 +297,31 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
return acl_op;
};
break;
- case Algorithm::EltwiseSubtract:
+ case Algorithm::EltwiseSubtract: {
// For u8, Subtract must wrap (e.g., 3 - 4 = 255), not saturate to 0.
// Use wrap-around only for pure u8->u8 subtract; QDQ patterns with u8
// inputs but f32/i32 output must keep ConvertPolicy::SATURATE.
// See https://github.com/openvinotoolkit/openvino/issues/33164

const bool is_u8_u8_to_u8 = (srcDescs[0]->getPrecision() == ov::element::u8) &&
(srcDescs[1]->getPrecision() == ov::element::u8) &&
(dstDescs[0]->getPrecision() == ov::element::u8);

const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
- ConvertPolicy::SATURATE)) {
+ convert_policy)) {
return false;
}
- exec_func = [this]() -> std::unique_ptr<IFunction> {
+ exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticSubtraction>();
- acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
+ acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseDivide:
if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) {
return false;
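For the ACL path, the behavioral difference comes down to Arm Compute Library's ConvertPolicy. A scalar sketch (illustrative only; it models the arithmetic, not ACL internals) of what the two policies mean for a u8 destination:

#include <algorithm>
#include <cassert>
#include <cstdint>

// SATURATE clamps the exact result into the u8 range [0, 255].
static uint8_t sub_u8_saturate(uint8_t a, uint8_t b) {
    const int exact = static_cast<int>(a) - static_cast<int>(b);
    return static_cast<uint8_t>(std::clamp(exact, 0, 255));
}

// WRAP keeps the result mod 256.
static uint8_t sub_u8_wrap(uint8_t a, uint8_t b) {
    return static_cast<uint8_t>(a - b);
}

int main() {
    assert(sub_u8_saturate(3, 4) == 0);  // previous behavior: underflow clamps to 0
    assert(sub_u8_wrap(3, 4) == 255);    // u8->u8 behavior required by issue #33164
    return 0;
}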
@@ -0,0 +1,284 @@
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

// Regression test for GitHub issue #33164:
// u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0.
// https://github.com/openvinotoolkit/openvino/issues/33164
//
// Additionally, the tests ensure that a TypeRelaxed Subtract with u8 inputs but
// f32/i32 output does NOT use wrap-around (it must produce negative values).
// This catches regressions in LPT/dequantization patterns.

#include <gtest/gtest.h>

#include "openvino/op/parameter.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/openvino.hpp"
#include "ov_ops/type_relaxed.hpp"

namespace ov {
namespace test {
namespace {

class SubtractU8WrapAroundTest : public ::testing::Test {
protected:
void SetUp() override {
core = std::make_shared<ov::Core>();
}

std::shared_ptr<ov::Core> core;
};

// Test that u8 subtraction wraps around instead of saturating.
// This is a regression test for https://github.com/openvinotoolkit/openvino/issues/33164
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior) {
// Create a simple model: out = a - b (both u8)
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

// Compile for CPU
auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

// Test cases that exercise underflow:
// Input A: [3, 0, 1, 5]
// Input B: [4, 1, 2, 3]
// Expected with wrap-around (mod 256): [255, 255, 255, 2]
// Wrong saturation result would be: [0, 0, 0, 2]
std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};
std::vector<uint8_t> expected = {255, 255, 255, 2};

auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]) << ". u8 subtraction should wrap around (mod 256), not saturate to 0.";
}
}

// Test with larger tensor to exercise vector path in JIT
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehaviorLargeVector) {
const size_t size = 64; // Large enough to trigger vectorized JIT path

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a(size);
std::vector<uint8_t> input_b(size);
std::vector<uint8_t> expected(size);

for (size_t i = 0; i < size; ++i) {
input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating
input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating
// a - b is always -1 here, so every expected value wraps to 255 (mod 256)
expected[i] = static_cast<uint8_t>((256 + input_a[i] - input_b[i]) % 256);
}

auto tensor_a = ov::Tensor(ov::element::u8, {size}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {size}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]);
}
}

// Test with 4D tensor to match typical NN tensor shapes
TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2});
auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b);
auto result = std::make_shared<ov::op::v0::Result>(subtract);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

// All zeros minus all ones should give all 255s (wrap-around)
std::vector<uint8_t> input_a(8, 0);
std::vector<uint8_t> input_b(8, 1);
std::vector<uint8_t> expected(8, 255);

auto tensor_a = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto output = infer_request.get_output_tensor(0);
auto output_data = output.data<uint8_t>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(output_data[i], expected[i])
<< "4D tensor mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected "
<< static_cast<int>(expected[i]);
}
}

// ============================================================================
// TypeRelaxed tests: u8 inputs with non-u8 output (dequantization patterns)
// These tests ensure that the u8 wrap-around path is NOT used when output
// type is f32 or i32 (typical in LPT/QDQ patterns).
// ============================================================================
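
// For context, a rough sketch of how such mixed-type Subtracts arise (the exact
// LPT-produced graph may differ; this shape is an assumption for illustration):
//
//   u8 tensor --Convert(f32)--> Subtract(f32 zero_point) --> Multiply(f32 scale)
//
// LPT can fold the Convert into the Subtract, leaving a TypeRelaxed op with u8
// inputs that must still compute in f32, e.g. 3 - 4 == -1.0f, never 255.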

// u8 inputs, but output overridden to f32: MUST NOT wrap, should give negatives.
// This test would have caught the "unsupported src_prc: u8" crash seen in CI.
TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_NoWrap_NoCrash) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});

// Create a TypeRelaxed subtract: u8 inputs, f32 output
// This simulates dequantization subtract patterns in LPT
using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(
ov::element::TypeVector{ov::element::f32, ov::element::f32}, // origin input types for inference
ov::element::TypeVector{ov::element::f32}, // overridden output type
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};

auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data());
auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data());

infer_request.set_tensor(a, tensor_a);
infer_request.set_tensor(b, tensor_b);
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
ASSERT_EQ(out.get_element_type(), ov::element::f32);

auto* out_data = out.data<float>();
// With proper dequantization semantics, these should be NEGATIVE values
// NOT wrap-around values like 255
std::vector<float> expected = {-1.f, -1.f, -1.f, 2.f};

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_FLOAT_EQ(out_data[i], expected[i])
<< "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]
<< ". TypeRelaxed u8->f32 subtract must NOT use wrap-around.";
}
}

// Same idea, but output overridden to i32.
TEST_F(SubtractU8WrapAroundTest, U8Inputs_I32Output_NoWrap_NoCrash) {
auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4});

// Create a TypeRelaxed subtract: u8 inputs, i32 output
using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(
ov::element::TypeVector{ov::element::i32, ov::element::i32}, // origin input types for inference
ov::element::TypeVector{ov::element::i32}, // overridden output type
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a = {3, 0, 1, 5};
std::vector<uint8_t> input_b = {4, 1, 2, 3};

infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {4}, input_a.data()));
infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {4}, input_b.data()));
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
ASSERT_EQ(out.get_element_type(), ov::element::i32);

auto* out_data = out.data<int32_t>();
// With proper dequantization semantics, these should be NEGATIVE values
std::vector<int32_t> expected = {-1, -1, -1, 2};

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_EQ(out_data[i], expected[i]) << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]
<< ". TypeRelaxed u8->i32 subtract must NOT use wrap-around.";
}
}

// Test with larger vector to exercise JIT vectorized path for TypeRelaxed
TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_LargeVector) {
const size_t size = 64; // Large enough to trigger vectorized JIT path

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size});

using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>;
auto sub = std::make_shared<TRSub>(ov::element::TypeVector{ov::element::f32, ov::element::f32},
ov::element::TypeVector{ov::element::f32},
a,
b);

auto result = std::make_shared<ov::op::v0::Result>(sub);
auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b});

auto compiled_model = core->compile_model(model, "CPU");
auto infer_request = compiled_model.create_infer_request();

std::vector<uint8_t> input_a(size);
std::vector<uint8_t> input_b(size);
std::vector<float> expected(size);

for (size_t i = 0; i < size; ++i) {
input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating
input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating
// Expected: proper subtraction with negative results
expected[i] = static_cast<float>(static_cast<int>(input_a[i]) - static_cast<int>(input_b[i]));
}

infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {size}, input_a.data()));
infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {size}, input_b.data()));
infer_request.infer();

auto out = infer_request.get_output_tensor(0);
auto* out_data = out.data<float>();

for (size_t i = 0; i < expected.size(); ++i) {
EXPECT_FLOAT_EQ(out_data[i], expected[i])
<< "index=" << i << ": got " << out_data[i] << ", expected " << expected[i];
}
}

} // namespace
} // namespace test
} // namespace ov