Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
case ov::element::i32:
h->uni_vpaddd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 addition uses vpaddb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 255 + 1 = 0.
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 add JIT must only be used for u8 output");
h->uni_vpaddb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
Expand All @@ -92,9 +98,20 @@ void jit_add_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs, const std
}
}

std::set<std::vector<element::Type>> jit_add_emitter::get_supported_precisions(const std::shared_ptr<ov::Node>& node) {
    // f32 and i32 additions are always available in this emitter.
    std::set<std::vector<element::Type>> precisions{{element::f32, element::f32}, {element::i32, element::i32}};

    // u8 wrap-around addition is advertised only for pure u8->u8 arithmetic.
    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
    // A null node means a general capability query (SupportedPrecisions functor), for
    // which the u8 pair is deliberately not reported.
    if (node) {
        const bool pure_u8 = ov::intel_cpu::all_of(element::u8,
                                                   node->get_input_element_type(0),
                                                   node->get_input_element_type(1),
                                                   node->get_output_element_type(0));
        if (pure_u8) {
            precisions.insert({element::u8, element::u8});
        }
    }

    return precisions;
}

/// MUL_ADD ///
Expand Down Expand Up @@ -243,6 +260,12 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
case ov::element::i32:
h->uni_vpsubd(vmm_dst, vmm_src0, vmm_src1);
break;
case ov::element::u8:
// u8 subtraction uses vpsubb which naturally wraps around (mod 256).
// This gives correct behavior: e.g., 3 - 4 = 255.
OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::u8, "u8 subtract JIT must only be used for u8 output");
h->uni_vpsubb(vmm_dst, vmm_src0, vmm_src1);
break;
default:
OV_CPU_JIT_EMITTER_THROW("Unsupported precision");
}
Expand All @@ -257,8 +280,20 @@ void jit_subtract_emitter::emit_isa(const std::vector<size_t>& in_vec_idxs,
}

std::set<std::vector<element::Type>> jit_subtract_emitter::get_supported_precisions(
    const std::shared_ptr<ov::Node>& node) {
    // f32 and i32 subtractions are always available in this emitter.
    std::set<std::vector<element::Type>> precisions{{element::f32, element::f32}, {element::i32, element::i32}};

    // u8 wrap-around subtraction is advertised only for pure u8->u8 arithmetic.
    // QDQ/dequantization patterns (u8 input, f32/i32 output) must NOT use u8 execution.
    // A null node means a general capability query (SupportedPrecisions functor), for
    // which the u8 pair is deliberately not reported.
    if (node) {
        const bool pure_u8 = ov::intel_cpu::all_of(element::u8,
                                                   node->get_input_element_type(0),
                                                   node->get_input_element_type(1),
                                                   node->get_output_element_type(0));
        if (pure_u8) {
            precisions.insert({element::u8, element::u8});
        }
    }

    return precisions;
}

/// MULTIPLY ///
Expand Down
38 changes: 30 additions & 8 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "openvino/core/except.hpp"
#include "openvino/core/type/element_type.hpp"
#include "utils/debug_capabilities.h"
#include "utils/general_utils.h"

namespace ov::intel_cpu {

Expand Down Expand Up @@ -264,19 +265,29 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const

std::function<std::unique_ptr<IFunction>(void)> exec_func;
switch (aclEltwiseAttrs.data.algo) {
case Algorithm::EltwiseAdd:
case Algorithm::EltwiseAdd: {
// For u8, Add must wrap on overflow (e.g. 255 + 1 = 0), not saturate.
// Only use wrap-around for pure u8->u8 add.
// QDQ patterns with u8 input but f32/i32 output must saturate.
const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
srcDescs[0]->getPrecision(),
srcDescs[1]->getPrecision(),
dstDescs[0]->getPrecision());
const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticAddition::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
ConvertPolicy::SATURATE)) {
convert_policy)) {
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticAddition>();
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseMultiply:
if (!NEPixelWiseMultiplication::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
Expand All @@ -297,19 +308,30 @@ bool AclEltwiseExecutor::init(const std::vector<MemoryDescPtr>& srcDescs, const
return acl_op;
};
break;
case Algorithm::EltwiseSubtract:
case Algorithm::EltwiseSubtract: {
// For u8, Subtract must wrap (e.g. 3 - 4 = 255), not saturate to 0.
// Only use wrap-around for pure u8->u8 subtract.
// QDQ patterns with u8 input but f32/i32 output must saturate.
const bool is_u8_u8_to_u8 = ov::intel_cpu::all_of(ov::element::u8,
srcDescs[0]->getPrecision(),
srcDescs[1]->getPrecision(),
dstDescs[0]->getPrecision());

const auto convert_policy = is_u8_u8_to_u8 ? ConvertPolicy::WRAP : ConvertPolicy::SATURATE;

if (!NEArithmeticSubtraction::validate(srcTensorsInfo.data(),
&srcTensorsInfo[1],
dstTensorsInfo.data(),
ConvertPolicy::SATURATE)) {
convert_policy)) {
return false;
}
exec_func = [this]() -> std::unique_ptr<IFunction> {
exec_func = [this, convert_policy]() -> std::unique_ptr<IFunction> {
auto acl_op = std::make_unique<NEArithmeticSubtraction>();
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), ConvertPolicy::SATURATE);
acl_op->configure(srcTensors.data(), &srcTensors[1], dstTensors.data(), convert_policy);
return acl_op;
};
break;
}
case Algorithm::EltwiseDivide:
if (!NEElementwiseDivision::validate(srcTensorsInfo.data(), &srcTensorsInfo[1], dstTensorsInfo.data())) {
return false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#include "eltwise_overflow.hpp"

#include "common_test_utils/ov_tensor_utils.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/subtract.hpp"

namespace ov {
namespace test {

// Builds a human-readable test name of the form "kind=<KIND>_shape=<SHAPE>".
std::string EltwiseOverflowLayerCPUTest::getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj) {
    const auto& [kind, shape] = obj.param;
    const char* kind_str = (kind == EltwiseOverflowKind::UNDERFLOW) ? "UNDERFLOW" : "OVERFLOW";
    std::ostringstream name;
    name << "kind=" << kind_str << "_shape=" << shape;
    return name.str();
}

void EltwiseOverflowLayerCPUTest::SetUp() {
targetDevice = ov::test::utils::DEVICE_CPU;
abs_threshold = 0;

const auto& [kind, shape] = GetParam();
overflowKind = kind;

InputShape inShape = {{}, {shape}};
init_input_shapes({inShape, inShape});

auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));
auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::PartialShape(shape));

std::shared_ptr<ov::Node> op;
if (kind == EltwiseOverflowKind::UNDERFLOW) {
op = std::make_shared<ov::op::v1::Subtract>(a, b);
} else {
op = std::make_shared<ov::op::v1::Add>(a, b);
}

auto result = std::make_shared<ov::op::v0::Result>(op);
function = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}, "EltwiseOverflow");
}

// Fills both model inputs with fixed u8 operand patterns that are guaranteed to
// underflow (Subtract) or overflow (Add) regardless of the target shape.
// The patterns repeat cyclically to cover shapes of any size.
void EltwiseOverflowLayerCPUTest::generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) {
    inputs.clear();
    const auto& modelInputs = function->inputs();
    ASSERT_EQ(modelInputs.size(), 2u);
    ASSERT_EQ(targetInputStaticShapes.size(), 2u);

    // Hardcoded values that guarantee underflow/overflow regardless of shape size.
    static const std::vector<uint8_t> underflow_a = {3, 0, 1, 5, 10, 0, 100, 50};
    static const std::vector<uint8_t> underflow_b = {4, 1, 2, 3, 20, 1, 200, 51};
    // Expected results (wrap): 255, 255, 255, 2, 246, 255, 156, 255

    static const std::vector<uint8_t> overflow_a = {255, 254, 200, 128, 255, 250, 255, 1};
    static const std::vector<uint8_t> overflow_b = {1, 2, 100, 128, 255, 10, 128, 255};
    // Expected results (wrap): 0, 0, 44, 0, 254, 4, 127, 0

    const bool is_underflow = (overflowKind == EltwiseOverflowKind::UNDERFLOW);
    const auto& src_a = is_underflow ? underflow_a : overflow_a;
    const auto& src_b = is_underflow ? underflow_b : overflow_b;

    auto t0 = ov::Tensor(ov::element::u8, targetInputStaticShapes[0]);
    auto t1 = ov::Tensor(ov::element::u8, targetInputStaticShapes[1]);

    // Fill each tensor in place: avoids the intermediate std::vector buffers and an
    // extra copy, and sizes each fill by the tensor's own element count instead of
    // assuming both inputs share input 0's size.
    auto fill = [](ov::Tensor& t, const std::vector<uint8_t>& pattern) {
        auto* data = t.data<uint8_t>();
        const size_t count = t.get_size();
        for (size_t i = 0; i < count; ++i) {
            data[i] = pattern[i % pattern.size()];
        }
    };
    fill(t0, src_a);
    fill(t1, src_b);

    inputs.insert({modelInputs[0].get_node_shared_ptr(), t0});
    inputs.insert({modelInputs[1].get_node_shared_ptr(), t1});
}

// Runs inference on the CPU device and compares against the reference implementation.
// abs_threshold is 0 (set in SetUp), so wrap-around results must match exactly.
TEST_P(EltwiseOverflowLayerCPUTest, CompareWithRefs) {
    run();
}

} // namespace test
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#pragma once

#include <gtest/gtest.h>

#include "shared_test_classes/base/ov_subgraph.hpp"

namespace ov {
namespace test {

// Which u8 wrap-around scenario a test instance exercises.
// NOTE(review): UNDERFLOW and OVERFLOW are also X/Open macros defined by <math.h>;
// if a translation unit includes <cmath>/<math.h> before this header these
// enumerator names may clash — confirm this compiles on all supported toolchains.
enum class EltwiseOverflowKind { UNDERFLOW, OVERFLOW };

// Test parameters: (overflow kind, input/output shape).
using EltwiseOverflowTestParams = std::tuple<EltwiseOverflowKind, ov::Shape>;

// Verifies that u8 eltwise Add/Subtract wrap around (mod 256) instead of saturating.
class EltwiseOverflowLayerCPUTest : public testing::WithParamInterface<EltwiseOverflowTestParams>,
                                    virtual public SubgraphBaseTest {
public:
    // Builds a readable test name ("kind=..._shape=...") from the parameters.
    static std::string getTestCaseName(const testing::TestParamInfo<EltwiseOverflowTestParams>& obj);

protected:
    // Creates a two-input u8 Add (OVERFLOW) or Subtract (UNDERFLOW) model for the CPU device.
    void SetUp() override;
    // Fills both inputs with fixed patterns guaranteed to overflow/underflow in u8.
    void generate_inputs(const std::vector<ov::Shape>& targetInputStaticShapes) override;

private:
    // Scenario selected by the test parameter; assigned in SetUp().
    EltwiseOverflowKind overflowKind;
};

} // namespace test
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//


#include "custom/single_layer_tests/classes/eltwise_overflow.hpp"

namespace ov {
namespace test {

// Both wrap-around directions are covered: Subtract underflow and Add overflow.
const std::vector<EltwiseOverflowKind> overflowKinds = {EltwiseOverflowKind::UNDERFLOW, EltwiseOverflowKind::OVERFLOW};

// Shapes chosen to cover the scalar tail and the vectorized JIT code path, plus a typical 4D layout.
const std::vector<ov::Shape> testShapes = {
{4}, // small 1D
{64}, // larger 1D to exercise vectorized JIT path
{1, 2, 2, 2}, // 4D typical NN shape
};

// Cartesian product of kinds x shapes; test names come from getTestCaseName.
INSTANTIATE_TEST_SUITE_P(smoke_EltwiseOverflowU8,
EltwiseOverflowLayerCPUTest,
::testing::Combine(::testing::ValuesIn(overflowKinds), ::testing::ValuesIn(testShapes)),
EltwiseOverflowLayerCPUTest::getTestCaseName);

} // namespace test
} // namespace ov