Skip to content
Merged
Show file tree
Hide file tree
Changes from 75 commits
Commits
Show all changes
77 commits
Select commit Hold shift + click to select a range
0bf796a
Added QDQ stripping pipeline to CPU plugin
v-Golubev Feb 4, 2026
86044a8
CPU QDQ Stripping tests
v-Golubev Feb 4, 2026
94ab742
[TMP] Add debug serialization
v-Golubev Feb 4, 2026
4a5f8cd
Refactor FQStripping
v-Golubev Feb 4, 2026
18aca0c
Tests refactoring
v-Golubev Feb 4, 2026
30df189
NeedScalingMulMatMul
v-Golubev Feb 4, 2026
3360153
Added third pattern
v-Golubev Feb 4, 2026
23c839e
WIP: 1st stage
v-Golubev Feb 4, 2026
aaed259
Tests further modification
v-Golubev Feb 5, 2026
c4ed9d8
Pass simplification
v-Golubev Feb 5, 2026
b218b13
Original implementation WIP
v-Golubev Feb 5, 2026
927bd27
Added debug serialization for GPU
v-Golubev Feb 5, 2026
cbc3120
Restore old build_shared_dq_pattern builder
v-Golubev Feb 5, 2026
05fa938
Adjust quantization values
v-Golubev Feb 5, 2026
522f4ec
Simplified impl: first steps
v-Golubev Feb 5, 2026
883079e
Scale adjustment initial impl
v-Golubev Feb 5, 2026
470aee5
All tests passed
v-Golubev Feb 5, 2026
1b57039
Logging via env variable
v-Golubev Feb 5, 2026
b219805
Add scale_factor related env variables
v-Golubev Feb 5, 2026
414460e
Added serialization via env variable
v-Golubev Feb 5, 2026
777dacd
Removed debug serialization from plugins
v-Golubev Feb 5, 2026
f23c48f
Change debug log env var name
v-Golubev Feb 5, 2026
bcf272c
Added SERIALIZE_MODEL env var
v-Golubev Feb 6, 2026
1f95a5d
Convert on zero point
v-Golubev Feb 6, 2026
ef74512
tests builder unification
v-Golubev Feb 6, 2026
55dd75d
Minor pass adjustments
v-Golubev Feb 6, 2026
e2067c1
Add bias after ONNX FE conversion support
v-Golubev Feb 6, 2026
66ca640
Fixed max dq scale computation
v-Golubev Feb 9, 2026
691d2f5
Reproduce f16 overflow in tests in case when scale adjustment doesn't…
v-Golubev Feb 9, 2026
9dcb2c5
Stop backward propagation at ShapeOf nodes
v-Golubev Feb 11, 2026
dfcc682
MatMul with bias support
v-Golubev Feb 11, 2026
f2b8e31
Cover MM with bias with tests
v-Golubev Feb 11, 2026
f21cb93
NeedScalingResidualBlock: tightened threshold
v-Golubev Feb 11, 2026
aab88d5
NeedScalingMulMatMul: tightened threshold
v-Golubev Feb 11, 2026
8555aa8
Threshold is aligned for all tests
v-Golubev Feb 11, 2026
e9338e7
Logging extended
v-Golubev Feb 11, 2026
fb1356d
QDQ stripping: implement forward propagation for scale adjustment
v-Golubev Feb 12, 2026
e836b87
QDQ stripping: handle FQ in propagation and merge into single pass
v-Golubev Feb 12, 2026
644ae12
Track stripped FQs and prevent skip-connection double-scaling in forw…
v-Golubev Feb 12, 2026
4ac06a6
Add FQ on residual block Conv branch to test backward propagation fro…
v-Golubev Feb 12, 2026
83ee56b
Use 65536 levels for test FQs so they are stripped after range adjust…
v-Golubev Feb 12, 2026
c28e302
Add TransformationTestsF tests for QDQ stripping algorithm
v-Golubev Feb 12, 2026
8135ae7
Fixed fq_ranges_are_the_same lambda
v-Golubev Feb 12, 2026
de6e1fd
Fix FQ ranges in GPU and LPT QDQ stripping tests
v-Golubev Feb 12, 2026
ada023e
style
v-Golubev Feb 12, 2026
5eb3721
ratio is set to 10
v-Golubev Feb 12, 2026
a5f0823
Moved builders to tests
v-Golubev Feb 12, 2026
0e03f19
LPT subgraph tests
v-Golubev Feb 12, 2026
c5c1aad
style
v-Golubev Feb 12, 2026
8678278
Reverted CPU related changes
v-Golubev Feb 13, 2026
088a020
Introduced need_weights_adjustment parameter
v-Golubev Feb 13, 2026
9b2604b
LPT tests extending
v-Golubev Feb 13, 2026
8b8ecfc
Handle Result in forward propagation
v-Golubev Feb 13, 2026
f0998c1
debug info cleanup
v-Golubev Feb 13, 2026
b7262ac
Per-channel FQ support
v-Golubev Feb 13, 2026
f7752b6
refactoring
v-Golubev Feb 13, 2026
b68350a
Further code cleanup
v-Golubev Feb 13, 2026
eb47692
Simplification
v-Golubev Feb 13, 2026
cc8e3b4
Avoid double scaling
v-Golubev Feb 13, 2026
3f0b856
Refactoring
v-Golubev Feb 13, 2026
be62a55
Handle mul/MatMul with 2 activations
v-Golubev Feb 13, 2026
85a4951
Cleanup
v-Golubev Feb 13, 2026
bb2d73d
Some refactoring
v-Golubev Feb 16, 2026
3b695fe
Handled bias during forward propagation
v-Golubev Feb 16, 2026
bb9c6be
Forward/Backward propagation significantly simplified
v-Golubev Feb 16, 2026
4112f19
Further simplification
v-Golubev Feb 16, 2026
8a4b222
ScaleAdjuster: is_allowed_node introduced for safety
v-Golubev Feb 16, 2026
4ef530c
Remove some nodes from is_allowed_node
v-Golubev Feb 16, 2026
b922711
Code cleanup
v-Golubev Feb 16, 2026
e8d81aa
Added some docs
v-Golubev Feb 16, 2026
20bb11e
Tests cleanup
v-Golubev Feb 16, 2026
24c7ecc
Fixed shared_dq test case
v-Golubev Feb 16, 2026
36ab9ce
fixed build_residual_block_pattern
v-Golubev Feb 16, 2026
721e31e
Review comments applied
v-Golubev Feb 18, 2026
767438a
LPT test builders correction
v-Golubev Feb 18, 2026
e42db4d
[TESTS] Residual block pattern: added shortcut convolution
v-Golubev Feb 25, 2026
46c7182
Merge branch 'master' into vg/lpt/qdq_stripping_rework
v-Golubev Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <set>

#include "lpt_visibility.hpp"
#include "openvino/pass/matcher_pass.hpp"
#include "openvino/pass/pass.hpp"
#include "quantization_details.hpp"

namespace ov {
Expand All @@ -20,10 +20,15 @@ namespace low_precision {
* @brief FQStrippingTransformation strips FakeQuantize operations with specified levels
* by replacing them with Clamp operations.
*/
class LP_TRANSFORMATIONS_API FQStrippingTransformation : public ov::pass::MatcherPass {
class LP_TRANSFORMATIONS_API FQStrippingTransformation : public ov::pass::ModelPass {
public:
OPENVINO_RTTI("FQStrippingTransformation", "0", MatcherPass);
FQStrippingTransformation(const std::set<size_t>& levels_to_strip, bool replace_with_clamp);
OPENVINO_MODEL_PASS_RTTI("low_precision::FQStrippingTransformation");
FQStrippingTransformation(const std::set<size_t>& levels_to_strip, bool need_weights_adjustment = false);
bool run_on_model(const std::shared_ptr<ov::Model>& m) override;

private:
const std::set<size_t> levels_to_strip;
const bool need_weights_adjustment;
};

} // namespace low_precision
Expand Down
278 changes: 227 additions & 51 deletions src/common/low_precision_transformations/src/qdq_stripping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,87 +4,263 @@

#include "low_precision/qdq_stripping.hpp"

#include <algorithm>
#include <memory>
#include <queue>
#include <unordered_set>

#include "itt.hpp"
#include "low_precision/common/ie_lpt_exception.hpp"
#include "low_precision/lpt_itt.hpp"
#include "low_precision/network_helper.hpp"
#include "openvino/core/except.hpp"
#include "openvino/core/type.hpp"
#include "openvino/core/validation_util.hpp"
#include "openvino/op/abs.hpp"
#include "openvino/op/add.hpp"
#include "openvino/op/broadcast.hpp"
#include "openvino/op/clamp.hpp"
#include "openvino/op/concat.hpp"
#include "openvino/op/constant.hpp"
#include "openvino/op/convolution.hpp"
#include "openvino/op/divide.hpp"
#include "openvino/op/equal.hpp"
#include "openvino/op/fake_quantize.hpp"
#include "openvino/op/gather.hpp"
#include "openvino/op/less.hpp"
#include "openvino/op/matmul.hpp"
#include "openvino/op/multiply.hpp"
#include "openvino/op/mvn.hpp"
#include "openvino/op/reshape.hpp"
#include "openvino/op/result.hpp"
#include "openvino/op/shape_of.hpp"
#include "openvino/op/softmax.hpp"
#include "openvino/op/subtract.hpp"
#include "openvino/op/transpose.hpp"
#include "openvino/pass/pattern/matcher.hpp"
#include "openvino/pass/pattern/op/block.hpp"
#include "openvino/pass/pattern/op/optional.hpp"
#include "openvino/pass/pattern/op/wrap_type.hpp"
#include "openvino/util/log.hpp"
#include "transformations/utils/utils.hpp"

namespace ov {
namespace pass {
namespace low_precision {
namespace {
// Motivation: if the stripped FQ's dequantization scale (y_scale) is large,
// the original activation values flowing through the stripped FQ path can exceed f16 range,
// causing overflow and corrupting inference results.
//
// ScaleAdjuster reduces the magnitude of activations, keeping them within f16 range, by dividing weight DQ constants by
// `scale_divisor = y_scale × ratio` (where `ratio = 10.0`).
// Note: Such scaling is possible only if all downstream paths go to scale-invariant nodes (such as MVN or Softmax).
class ScaleAdjuster {
public:
ScaleAdjuster(float scale_divisor, const std::shared_ptr<ov::Node>& fq)
: m_scale_divisor(scale_divisor),
m_fq(fq.get()) {}

FQStrippingTransformation::FQStrippingTransformation(const std::set<size_t>& levels_to_strip, bool replace_with_clamp) {
MATCHER_SCOPE(FQStrippingTransformation);
auto is_scalar = [](const Output<Node>& output) -> bool {
return ov::shape_size(output.get_shape()) == 1;
};
auto input_low_m = pattern::wrap_type<ov::op::v0::Constant>(is_scalar);
auto input_high_m = pattern::wrap_type<ov::op::v0::Constant>(is_scalar);
auto output_low_m = pattern::wrap_type<ov::op::v0::Constant>(is_scalar);
auto output_high_m = pattern::wrap_type<ov::op::v0::Constant>(is_scalar);
auto fq_m = pattern::wrap_type<ov::op::v0::FakeQuantize>(
{pattern::any_input(), input_low_m, input_high_m, output_low_m, output_high_m});

ov::graph_rewrite_callback callback = [OV_CAPTURE_CPY_AND_THIS](pattern::Matcher& m) {
const auto& pattern_map = m.get_pattern_value_map();
auto node = ov::as_type_ptr<ov::op::v0::FakeQuantize>(pattern_map.at(fq_m).get_node_shared_ptr());
if (!node) {
return false;
}
void adjust() {
propagate_backward(m_fq);
propagate_forward(m_fq);

const size_t levels = node->get_levels();
if (!levels_to_strip.count(levels)) {
return false;
if (scale_adjustment_possible()) {
for (auto& input : m_pending_adjustments) {
auto original_const = input.get_source_output();
auto divisor_const =
ov::op::v0::Constant::create(original_const.get_element_type(), {}, {m_scale_divisor});
auto new_const = ov::op::util::make_try_fold<ov::op::v1::Divide>(original_const, divisor_const);
OPENVINO_ASSERT(new_const, "Adjusted scale must be constant");
ov::copy_runtime_info(original_const.get_node_shared_ptr(), new_const);
input.replace_source_output(new_const);
}
}
}

private:
float m_scale_divisor;
ov::Node* m_fq;
bool m_scale_adjustment_possible = true;

auto input = node->get_input_node_shared_ptr(0);
auto input_low = ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(input_low_m).get_node_shared_ptr());
auto input_high = ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(input_high_m).get_node_shared_ptr());
auto output_low = ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(output_low_m).get_node_shared_ptr());
auto output_high = ov::as_type_ptr<ov::op::v0::Constant>(pattern_map.at(output_high_m).get_node_shared_ptr());
std::unordered_set<ov::Node*> m_visited;
std::vector<ov::Input<ov::Node>> m_pending_adjustments;

if (!input_low || !input_high || !output_low || !output_high) {
return false;
bool scale_adjustment_possible() const {
return m_scale_adjustment_possible;
}

static bool is_scale_invariant(ov::Node* n) {
return ov::is_type_any_of<ov::op::v0::MVN, ov::op::v6::MVN, ov::op::v1::Softmax, ov::op::v8::Softmax>(n);
}

void validate_activations_flow_node(ov::Node* n, bool is_forward) {
if (is_forward && is_scale_invariant(n)) {
return;
}
auto constants_are_equal = [](const std::shared_ptr<ov::op::v0::Constant>& lhs,
const std::shared_ptr<ov::op::v0::Constant>& rhs) -> bool {
auto equal =
ov::as_type_ptr<ov::op::v0::Constant>(ov::op::util::make_try_fold<ov::op::v1::Equal>(lhs, rhs));
OPENVINO_ASSERT(equal && ov::shape_size(equal->get_shape()) == 1,
"constants_are_equal expects scalar constant as a comparison result");
return equal->get_vector<bool>()[0];
// Note: the set of supported nodes is intentionally limited to avoid overcomplicating the adjuster logic and make it safer.
// The current set is enough for covering all existing models which require scale adjustment.
if (!ov::is_type_any_of<ov::op::v1::Add,
ov::op::v0::Constant,
ov::op::v0::Convert,
ov::op::v1::Convolution,
ov::op::v0::FakeQuantize,
ov::op::v0::MatMul,
ov::op::v1::Multiply,
ov::op::v1::Reshape,
ov::op::v1::Transpose>(n)) {
m_scale_adjustment_possible = false;
}
}

auto make_skip_predicate(bool is_forward) {
return [this, is_forward](ov::Node* n) {
const auto& out_precision = n->get_output_element_type(0);
const bool shapeof_subgraph = ov::is_type<ov::op::v0::ShapeOf>(n) || ov::is_type<ov::op::v3::ShapeOf>(n) ||
out_precision == ov::element::i32 || out_precision == ov::element::i64;
// Both forward/backward propagation should not visit shape related paths
if (shapeof_subgraph) {
return true;
}

validate_activations_flow_node(n, is_forward);
return !scale_adjustment_possible() || (is_forward && is_scale_invariant(n));
};
}

void propagate_backward(ov::Node* root) {
auto collect_nodes_to_scale = [&](ov::Node* node) {
using namespace ov::pass::pattern;
auto convert = wrap_type<ov::op::v0::Convert>({wrap_const()});
auto sub_const_convert = optional<ov::op::v0::Convert>({wrap_const()});
auto subtract = optional<ov::op::v1::Subtract>({convert, sub_const_convert});
auto multiply = wrap_type<ov::op::v1::Multiply>({subtract, wrap_const()});
auto matcher = std::make_shared<Matcher>(multiply, "WeightsDQPattern");

// Case 1: DQ block on constant path — collect for scale adjustment
if (matcher->match(node->shared_from_this())) {
const auto mul = matcher->get_pattern_value_map().at(multiply).get_node_shared_ptr();
const bool const_is_in1 = ov::is_type<ov::op::v0::Constant>(mul->get_input_node_shared_ptr(1));

m_pending_adjustments.push_back(mul->input(const_is_in1 ? 1 : 0));
// Stop backward propagation since adjustement is done for this branch
for (const auto& in : matcher->get_match_root()->input_values()) {
m_visited.insert(in.get_node());
}
return;
}

// Case 2: FakeQuantize (un-stripped) — collect for ranges adjustment
if (ov::is_type<ov::op::v0::FakeQuantize>(node)) {
for (size_t i = 1; i < node->get_input_size(); ++i) {
m_pending_adjustments.push_back(node->input(i));
m_visited.insert(node->get_input_node_ptr(i));
}
return;
}

// Case 3: Layers with weights: backward propagation goes only by 2nd input
if (ov::is_type_any_of<ov::op::v0::MatMul, ov::op::v1::Multiply, ov::op::v1::Convolution>(node)) {
m_visited.insert(node->get_input_node_ptr(0));
return;
}
};
ov::op::util::visit_path(root, m_visited, collect_nodes_to_scale, make_skip_predicate(false));
}

void propagate_forward(ov::Node* root) {
auto collect_nodes_to_scale = [&](ov::Node* node) {
// Case 1: FakeQuantize (un-stripped) — collect for ranges adjustment
if (ov::is_type<ov::op::v0::FakeQuantize>(node) && node != m_fq && !node->get_users().empty()) {
for (size_t i = 1; i < node->get_input_size(); ++i) {
m_pending_adjustments.push_back(node->input(i));
}
return;
}

// Case 2: Commutative ops: backward propagation should be called for all non visited inputs
if (ov::is_type<ov::op::v1::Add>(node)) {
for (size_t i = 0; i < node->get_input_size(); ++i) {
auto input_node = node->get_input_node_ptr(i);
if (m_visited.count(input_node))
continue;
propagate_backward(input_node);
}
}
};
ov::op::util::visit_path_forward(root, m_visited, collect_nodes_to_scale, make_skip_predicate(true));
}
};
} // namespace

// Constructor: stores the set of FakeQuantize `levels` values that mark FQs eligible for
// stripping, and whether weight/range scale adjustment should run before stripping
// (see ScaleAdjuster above — it keeps activations within f16 range when the stripped
// FQ's dequantization scale is large).
FQStrippingTransformation::FQStrippingTransformation(const std::set<size_t>& levels_to_strip,
bool need_weights_adjustment)
: levels_to_strip(levels_to_strip),
need_weights_adjustment(need_weights_adjustment) {}

bool FQStrippingTransformation::run_on_model(const std::shared_ptr<ov::Model>& f) {
RUN_ON_FUNCTION_SCOPE(FQStrippingTransformation);
if (levels_to_strip.empty()) {
return false;
}

auto fq_ranges_are_the_same = [](const std::shared_ptr<ov::op::v0::FakeQuantize>& fq) {
auto equal_with_threshold = [](const ov::Output<ov::Node>& val1, const ov::Output<ov::Node>& val2) {
auto diff = std::make_shared<ov::op::v1::Subtract>(val1, val2);
auto abs_diff = std::make_shared<ov::op::v0::Abs>(diff);
auto eps = ov::op::v0::Constant::create(val1.get_element_type(), {}, {1e-6f});
auto is_less = ov::util::get_constant_from_source(std::make_shared<ov::op::v1::Less>(abs_diff, eps));

auto all_true = [](const std::shared_ptr<ov::op::v0::Constant>& c) {
auto v = c->get_vector<bool>();
return std::all_of(v.begin(), v.end(), [](bool b) {
return b;
});
};
return is_less && all_true(is_less);
};

if (!constants_are_equal(input_low, output_low) || !constants_are_equal(input_high, output_high)) {
return false;
return equal_with_threshold(fq->input_value(1), fq->input_value(3)) &&
equal_with_threshold(fq->input_value(2), fq->input_value(4));
};

bool model_changed = false;
for (const auto& node : f->get_ordered_ops()) {
auto fq = ov::as_type_ptr<ov::op::v0::FakeQuantize>(node);
if (!fq || transformation_callback(node)) {
continue;
}

bool res = false;
if (replace_with_clamp) {
auto clamp = std::make_shared<ov::op::v0::Clamp>(input->output(0),
output_low->cast_vector<double>()[0],
output_high->cast_vector<double>()[0]);
res = replace_node_update_name(node, clamp);
} else {
res = replace_output_update_name(node->output(0), node->input_value(0));
if (!levels_to_strip.count(fq->get_levels()) || !fq_ranges_are_the_same(fq)) {
continue;
}
OPENVINO_ASSERT(res, "FQ stripping failed");
return res;
};

auto m = std::make_shared<ov::pass::pattern::Matcher>(fq_m, matcher_name);
this->register_matcher(m, callback);
// Compute dq_scale = |input_high - input_low| / (levels - 1)
auto levels_minus_one_node = ov::op::v0::Constant::create(fq->input_value(2).get_element_type(),
ov::Shape{},
{static_cast<float>(fq->get_levels() - 1)});
auto input_range_node = std::make_shared<ov::op::v1::Subtract>(fq->input_value(2), fq->input_value(1));
auto abs_input_range = std::make_shared<ov::op::v0::Abs>(input_range_node);
auto dq_scale_per_elem = ov::util::get_constant_from_source(
std::make_shared<ov::op::v1::Divide>(abs_input_range, levels_minus_one_node));
if (!dq_scale_per_elem) {
continue;
}

const auto dq_scale_values = dq_scale_per_elem->cast_vector<float>();
const auto max_dq_scale = *std::max_element(dq_scale_values.begin(), dq_scale_values.end());
constexpr auto threshold = 1.0f;

if (need_weights_adjustment && max_dq_scale > threshold) {
constexpr auto ratio = 10.0f;
ScaleAdjuster adjuster(max_dq_scale * ratio, fq);
adjuster.adjust();
}

OPENVINO_ASSERT(replace_output_update_name(fq->output(0), fq->input_value(0)), "FQ stripping failed");
model_changed = true;
}

return model_changed;
}

} // namespace low_precision
Expand Down
Loading
Loading