-
Notifications
You must be signed in to change notification settings - Fork 3.1k
[CPU] Fix u8 Subtract to use wrap-around instead of saturation #33453
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
Nishant-ZFYII
wants to merge
8
commits into
openvinotoolkit:master
Choose a base branch
from
Nishant-ZFYII:fix/33164-u8-subtract-wrap-around
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
+216
−13
Open
Changes from 3 commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
4ca0fc2
[CPU] Fix u8 Subtract to use wrap-around instead of saturation
Nishant-ZFYII 6a70449
[CPU] Fix u8 subtract to wrap instead of saturate
Nishant-ZFYII d8e9383
[CPU] Fix u8 subtract to wrap instead of saturate
Nishant-ZFYII debffa0
[CPU] Address reviewer feedback for u8 eltwise wrap-around fix
Nishant-ZFYII b5561fa
[CPU] Add u8 wrap-around support for Add operation and address review…
Nishant-ZFYII 1badae9
Fix get_supported_precisions crash: use if-guard with OPENVINO_ASSERT
Nishant-ZFYII 6f33255
Clang-18 fix
Nishant-ZFYII d03bed1
Update copyright headers to 2026
Nishant-ZFYII File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
284 changes: 284 additions & 0 deletions
284
...ntel_cpu/tests/functional/custom/single_layer_tests/instances/common/subtract_u8_wrap.cpp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,284 @@ | ||
| // Copyright (C) 2025 Intel Corporation | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // | ||
|
|
||
| // Regression test for GitHub issue #33164: | ||
| // u8 Subtract must wrap around (e.g., 3 - 4 = 255), not saturate to 0. | ||
| // https://github.com/openvinotoolkit/openvino/issues/33164 | ||
| // | ||
| // Additionally, tests ensure that TypeRelaxed subtract with u8 inputs but | ||
| // f32/i32 output does NOT use wrap-around (must give negative values). | ||
| // This catches regressions in LPT/dequantization patterns. | ||
|
|
||
| #include <gtest/gtest.h> | ||
|
|
||
| #include "openvino/op/parameter.hpp" | ||
| #include "openvino/op/result.hpp" | ||
| #include "openvino/op/subtract.hpp" | ||
| #include "openvino/openvino.hpp" | ||
| #include "ov_ops/type_relaxed.hpp" | ||
|
|
||
| namespace ov { | ||
| namespace test { | ||
| namespace { | ||
|
|
||
| class SubtractU8WrapAroundTest : public ::testing::Test { | ||
| protected: | ||
| void SetUp() override { | ||
| core = std::make_shared<ov::Core>(); | ||
| } | ||
|
|
||
| std::shared_ptr<ov::Core> core; | ||
| }; | ||
|
|
||
| // Test that u8 subtraction wraps around instead of saturating. | ||
| // This is a regression test for https://github.com/openvinotoolkit/openvino/issues/33164 | ||
| TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior) { | ||
| // Create a simple model: out = a - b (both u8) | ||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
| auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b); | ||
| auto result = std::make_shared<ov::op::v0::Result>(subtract); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| // Compile for CPU | ||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| // Test cases that exercise underflow: | ||
| // Input A: [3, 0, 1, 5] | ||
| // Input B: [4, 1, 2, 3] | ||
| // Expected with wrap-around (mod 256): [255, 255, 255, 2] | ||
| // Wrong saturation result would be: [0, 0, 0, 2] | ||
| std::vector<uint8_t> input_a = {3, 0, 1, 5}; | ||
| std::vector<uint8_t> input_b = {4, 1, 2, 3}; | ||
| std::vector<uint8_t> expected = {255, 255, 255, 2}; | ||
|
|
||
| auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); | ||
| auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); | ||
|
|
||
| infer_request.set_tensor(a, tensor_a); | ||
| infer_request.set_tensor(b, tensor_b); | ||
| infer_request.infer(); | ||
|
|
||
| auto output = infer_request.get_output_tensor(0); | ||
| auto output_data = output.data<uint8_t>(); | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_EQ(output_data[i], expected[i]) | ||
| << "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected " | ||
| << static_cast<int>(expected[i]) << ". u8 subtraction should wrap around (mod 256), not saturate to 0."; | ||
| } | ||
| } | ||
|
|
||
| // Test with larger tensor to exercise vector path in JIT | ||
| TEST_F(SubtractU8WrapAroundTest, WrapAroundBehaviorLargeVector) { | ||
| const size_t size = 64; // Large enough to trigger vectorized JIT path | ||
|
|
||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size}); | ||
| auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b); | ||
| auto result = std::make_shared<ov::op::v0::Result>(subtract); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| std::vector<uint8_t> input_a(size); | ||
| std::vector<uint8_t> input_b(size); | ||
| std::vector<uint8_t> expected(size); | ||
|
|
||
| for (size_t i = 0; i < size; ++i) { | ||
| input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating | ||
| input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating | ||
| // Each result should be -1 mod 256 = 255, except when a >= b | ||
| expected[i] = static_cast<uint8_t>((256 + input_a[i] - input_b[i]) % 256); | ||
| } | ||
|
|
||
| auto tensor_a = ov::Tensor(ov::element::u8, {size}, input_a.data()); | ||
| auto tensor_b = ov::Tensor(ov::element::u8, {size}, input_b.data()); | ||
|
|
||
| infer_request.set_tensor(a, tensor_a); | ||
| infer_request.set_tensor(b, tensor_b); | ||
| infer_request.infer(); | ||
|
|
||
| auto output = infer_request.get_output_tensor(0); | ||
| auto output_data = output.data<uint8_t>(); | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_EQ(output_data[i], expected[i]) | ||
| << "Mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected " | ||
| << static_cast<int>(expected[i]); | ||
| } | ||
| } | ||
|
|
||
| // Test with 4D tensor to match typical NN tensor shapes | ||
| TEST_F(SubtractU8WrapAroundTest, WrapAroundBehavior4D) { | ||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{1, 2, 2, 2}); | ||
| auto subtract = std::make_shared<ov::op::v1::Subtract>(a, b); | ||
| auto result = std::make_shared<ov::op::v0::Result>(subtract); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| // All zeros minus all ones should give all 255s (wrap-around) | ||
| std::vector<uint8_t> input_a(8, 0); | ||
| std::vector<uint8_t> input_b(8, 1); | ||
| std::vector<uint8_t> expected(8, 255); | ||
|
|
||
| auto tensor_a = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_a.data()); | ||
| auto tensor_b = ov::Tensor(ov::element::u8, {1, 2, 2, 2}, input_b.data()); | ||
|
|
||
| infer_request.set_tensor(a, tensor_a); | ||
| infer_request.set_tensor(b, tensor_b); | ||
| infer_request.infer(); | ||
|
|
||
| auto output = infer_request.get_output_tensor(0); | ||
| auto output_data = output.data<uint8_t>(); | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_EQ(output_data[i], expected[i]) | ||
| << "4D tensor mismatch at index " << i << ": got " << static_cast<int>(output_data[i]) << ", expected " | ||
| << static_cast<int>(expected[i]); | ||
| } | ||
| } | ||
|
|
||
| // ============================================================================ | ||
| // TypeRelaxed tests: u8 inputs with non-u8 output (dequantization patterns) | ||
| // These tests ensure that the u8 wrap-around path is NOT used when output | ||
| // type is f32 or i32 (typical in LPT/QDQ patterns). | ||
| // ============================================================================ | ||
|
|
||
| // u8 inputs, but output overridden to f32: MUST NOT wrap, should give negatives. | ||
| // This test would have caught the CI failure "unsupported src_prc: u8" crash. | ||
| TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_NoWrap_NoCrash) { | ||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
|
|
||
| // Create a TypeRelaxed subtract: u8 inputs, f32 output | ||
| // This simulates dequantization subtract patterns in LPT | ||
| using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>; | ||
| auto sub = std::make_shared<TRSub>( | ||
| ov::element::TypeVector{ov::element::f32, ov::element::f32}, // origin input types for inference | ||
| ov::element::TypeVector{ov::element::f32}, // overridden output type | ||
| a, | ||
| b); | ||
|
|
||
| auto result = std::make_shared<ov::op::v0::Result>(sub); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| std::vector<uint8_t> input_a = {3, 0, 1, 5}; | ||
| std::vector<uint8_t> input_b = {4, 1, 2, 3}; | ||
|
|
||
| auto tensor_a = ov::Tensor(ov::element::u8, {4}, input_a.data()); | ||
| auto tensor_b = ov::Tensor(ov::element::u8, {4}, input_b.data()); | ||
|
|
||
| infer_request.set_tensor(a, tensor_a); | ||
| infer_request.set_tensor(b, tensor_b); | ||
| infer_request.infer(); | ||
|
|
||
| auto out = infer_request.get_output_tensor(0); | ||
| ASSERT_EQ(out.get_element_type(), ov::element::f32); | ||
|
|
||
| auto* out_data = out.data<float>(); | ||
| // With proper dequantization semantics, these should be NEGATIVE values | ||
| // NOT wrap-around values like 255 | ||
| std::vector<float> expected = {-1.f, -1.f, -1.f, 2.f}; | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_FLOAT_EQ(out_data[i], expected[i]) | ||
| << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] | ||
| << ". TypeRelaxed u8->f32 subtract must NOT use wrap-around."; | ||
| } | ||
| } | ||
|
|
||
| // Same idea, but output overridden to i32. | ||
| TEST_F(SubtractU8WrapAroundTest, U8Inputs_I32Output_NoWrap_NoCrash) { | ||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{4}); | ||
|
|
||
| // Create a TypeRelaxed subtract: u8 inputs, i32 output | ||
| using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>; | ||
| auto sub = std::make_shared<TRSub>( | ||
| ov::element::TypeVector{ov::element::i32, ov::element::i32}, // origin input types for inference | ||
| ov::element::TypeVector{ov::element::i32}, // overridden output type | ||
| a, | ||
| b); | ||
|
|
||
| auto result = std::make_shared<ov::op::v0::Result>(sub); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| std::vector<uint8_t> input_a = {3, 0, 1, 5}; | ||
| std::vector<uint8_t> input_b = {4, 1, 2, 3}; | ||
|
|
||
| infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {4}, input_a.data())); | ||
| infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {4}, input_b.data())); | ||
| infer_request.infer(); | ||
|
|
||
| auto out = infer_request.get_output_tensor(0); | ||
| ASSERT_EQ(out.get_element_type(), ov::element::i32); | ||
|
|
||
| auto* out_data = out.data<int32_t>(); | ||
| // With proper dequantization semantics, these should be NEGATIVE values | ||
| std::vector<int32_t> expected = {-1, -1, -1, 2}; | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_EQ(out_data[i], expected[i]) << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i] | ||
| << ". TypeRelaxed u8->i32 subtract must NOT use wrap-around."; | ||
| } | ||
| } | ||
|
|
||
| // Test with larger vector to exercise JIT vectorized path for TypeRelaxed | ||
| TEST_F(SubtractU8WrapAroundTest, U8Inputs_F32Output_LargeVector) { | ||
| const size_t size = 64; // Large enough to trigger vectorized JIT path | ||
|
|
||
| auto a = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size}); | ||
| auto b = std::make_shared<ov::op::v0::Parameter>(ov::element::u8, ov::Shape{size}); | ||
|
|
||
| using TRSub = ov::op::TypeRelaxed<ov::op::v1::Subtract>; | ||
| auto sub = std::make_shared<TRSub>(ov::element::TypeVector{ov::element::f32, ov::element::f32}, | ||
| ov::element::TypeVector{ov::element::f32}, | ||
| a, | ||
| b); | ||
|
|
||
| auto result = std::make_shared<ov::op::v0::Result>(sub); | ||
| auto model = std::make_shared<ov::Model>(ov::ResultVector{result}, ov::ParameterVector{a, b}); | ||
|
|
||
| auto compiled_model = core->compile_model(model, "CPU"); | ||
| auto infer_request = compiled_model.create_infer_request(); | ||
|
|
||
| std::vector<uint8_t> input_a(size); | ||
| std::vector<uint8_t> input_b(size); | ||
| std::vector<float> expected(size); | ||
|
|
||
| for (size_t i = 0; i < size; ++i) { | ||
| input_a[i] = static_cast<uint8_t>(i % 10); // 0-9 repeating | ||
| input_b[i] = static_cast<uint8_t>((i % 10) + 1); // 1-10 repeating | ||
| // Expected: proper subtraction with negative results | ||
| expected[i] = static_cast<float>(static_cast<int>(input_a[i]) - static_cast<int>(input_b[i])); | ||
EgorDuplensky marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| } | ||
|
|
||
| infer_request.set_tensor(a, ov::Tensor(ov::element::u8, {size}, input_a.data())); | ||
| infer_request.set_tensor(b, ov::Tensor(ov::element::u8, {size}, input_b.data())); | ||
| infer_request.infer(); | ||
|
|
||
| auto out = infer_request.get_output_tensor(0); | ||
| auto* out_data = out.data<float>(); | ||
|
|
||
| for (size_t i = 0; i < expected.size(); ++i) { | ||
| EXPECT_FLOAT_EQ(out_data[i], expected[i]) | ||
| << "index=" << i << ": got " << out_data[i] << ", expected " << expected[i]; | ||
| } | ||
| } | ||
|
|
||
| } // namespace | ||
| } // namespace test | ||
| } // namespace ov | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.