[QNN EP] Make offloading graph input/output quantization (to CPU) the default #23368

Draft: wants to merge 4 commits into base main.
4 changes: 2 additions & 2 deletions include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3665,8 +3665,8 @@ struct OrtApi {
  * - "1": Enabled.
  * "offload_graph_io_quantization": Offload graph input quantization and graph output dequantization to another
  * execution provider (typically CPU EP).
- * - "0": Default. Disabled. QNN EP will handle quantization and dequantization of graph I/O.
- * - "1": Enabled.
+ * - "0": Disabled. QNN EP will handle quantization and dequantization of graph I/O.
+ * - "1": Enabled. This is the default value.
  * "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
  * - "0": Default. Disabled.
  * - "1": Enabled.
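
For reference, enabling or disabling this option from application code looks roughly like the following minimal C++ sketch (not part of this PR); the model path and backend library name are placeholders:

// Minimal sketch: opting back into the old behavior ("0") when appending the QNN EP.
#include <string>
#include <unordered_map>

#include "onnxruntime_cxx_api.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "qnn_offload_example");
  Ort::SessionOptions session_options;

  std::unordered_map<std::string, std::string> qnn_options;
  qnn_options["backend_path"] = "libQnnHtp.so";        // placeholder; QnnHtp.dll on Windows
  qnn_options["offload_graph_io_quantization"] = "0";  // opt out of the new default ("1")

  session_options.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session session(env, ORT_TSTR("model.qdq.onnx"), session_options);
  return 0;
}
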
10 changes: 6 additions & 4 deletions onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -384,13 +384,15 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
   // Add this option because this feature requires QnnSystem lib and it's no supported for Windows x86_64 platform
   enable_spill_fill_buffer_ = ParseBoolOption("enable_htp_spill_fill_buffer", false, provider_options_map);
 
-  model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", false,
+  model_settings_.offload_graph_io_quantization = ParseBoolOption("offload_graph_io_quantization", true,
                                                                   provider_options_map);
 
   if (disable_cpu_ep_fallback_ && model_settings_.offload_graph_io_quantization) {
-    LOGS_DEFAULT(WARNING) << "Fallback to CPU EP is disabled, but user configured QNN EP to offload graph I/O "
-                          << "quantization/dequantization to another EP. Session creation will fail if the CPU EP "
-                          << "handles the graph I/O quantization/dequantization.";
+    LOGS_DEFAULT(INFO) << "Fallback to CPU EP is disabled, but user tried to configure QNN EP to offload graph I/O "
+                       << "quantization/dequantization to another EP. These are conflicting options. Fallback to CPU "
+                       << "EP will remain disabled and graph I/O quantization/dequantization will not be offloaded "
+                       << "to another EP.";
+    model_settings_.offload_graph_io_quantization = false;
  }
 
   static const std::string QNN_HTP_SHARED_MEMORY_ALLOCATOR_ENABLED = "enable_htp_shared_memory_allocator";
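
The conflict handled above arises when a session disables CPU EP fallback while the offload option is left at its new default; with this change QNN EP logs at INFO and forces the offload setting back to false instead of warning about a possible session-creation failure. A minimal C++ sketch of that configuration (an illustration, not taken from this PR, assuming the standard "session.disable_cpu_ep_fallback" config key):

// Minimal sketch: CPU EP fallback disabled while offload_graph_io_quantization
// stays at its new default of "1". QNN EP resolves the conflict internally by
// turning the offload back off.
#include <string>
#include <unordered_map>

#include "onnxruntime_cxx_api.h"

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_INFO, "qnn_conflict_example");
  Ort::SessionOptions session_options;
  session_options.AddConfigEntry("session.disable_cpu_ep_fallback", "1");

  std::unordered_map<std::string, std::string> qnn_options;
  qnn_options["backend_path"] = "libQnnHtp.so";  // placeholder backend library
  // offload_graph_io_quantization is intentionally not set, so it defaults to "1".

  session_options.AppendExecutionProvider("QNN", qnn_options);
  Ort::Session session(env, ORT_TSTR("model.qdq.onnx"), session_options);
  return 0;
}
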
1 change: 1 addition & 0 deletions onnxruntime/test/providers/qnn/argmaxmin_op_test.cc
@@ -70,6 +70,7 @@ static void RunQDQArgMxxOpTest(const std::string& op_type, TestInputDef<float> i
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs), // baseline float32 model
                        BuildQDQArgMxxTestCase<QType>(op_type, input_def, attrs), // QDQ model
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/average_pool_test.cc
@@ -31,6 +31,7 @@ static void RunAveragePoolOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
                   provider_options,
@@ -53,6 +54,7 @@ static void RunQDQAveragePoolOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, attrs),
                        BuildQDQOpTestCase<QuantType>(op_type, input_defs, {}, attrs),
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/batch_norm_htp_test.cc
@@ -160,6 +160,7 @@ static void RunBatchNormQDQTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
   TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def),
@@ -180,6 +181,7 @@ static void RunBatchNormFP16Test(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);
   TestInputDef<MLFloat16> scale_fp16_def = ConvertToFP16InputDef(scale_def);
1 change: 1 addition & 0 deletions onnxruntime/test/providers/qnn/cast_test.cc
@@ -57,6 +57,7 @@ static void RunCastOpTest(const std::vector<int64_t>& shape, ONNX_NAMESPACE::Ten
 #else
   provider_options["backend_path"] = use_htp ? "libQnnHtp.so" : "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   if (use_htp && enable_fp16_precision) {
     provider_options["enable_htp_fp16_precision"] = "1";
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/clip_op_test.cc
@@ -117,6 +117,7 @@ static void RunQDQClipTestOnHTP(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto f32_model_builder = BuildOpTestCase<float, float>("Clip", {input_def}, {min_max_defs}, {});
   auto qdq_model_builder = BuildQDQOpTestCase<QType, float>("Clip", {input_def}, {min_max_defs}, {},
@@ -205,6 +206,7 @@ TEST_F(QnnHTPBackendTests, Clip_U8_Rank5) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(model_fn,
                   provider_options,
5 changes: 5 additions & 0 deletions onnxruntime/test/providers/qnn/conv_test.cc
@@ -93,6 +93,8 @@ static void RunCPUConvOpTest(const std::string& conv_op_type, const TestInputDef
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
+
   auto build_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads,
                                        dilations, group, auto_pad);
   RunQnnModelTest(build_fn,
@@ -317,6 +319,7 @@ static void RunHTPConvOpTest(const std::string& conv_op_type, const TestInputDef
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
                                             group, auto_pad, output_activation),
@@ -354,6 +357,7 @@ static void RunHTPConvOpPerChannelTest(const std::string& conv_op_type, const Te
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto f32_fn = BuildF32ConvTestCase(conv_op_type, input_def, weights_def, bias_def, strides, pads, dilations,
                                      group, auto_pad, output_activation);
@@ -665,6 +669,7 @@ TEST_F(QnnHTPBackendTests, Test_QDQConvWithDynamicWeightsFromMul) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto BuildConvMulGraph = [](ModelTestBuilder& builder) {
     // DQ node for Conv input
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/flatten_op_test.cc
@@ -101,6 +101,7 @@ static void RunQDQFlattenTestOnHTP(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto f32_model_builder = BuildOpTestCase<float>("Flatten", {input_def}, {}, attrs);
   auto qdq_model_builder = BuildQDQOpTestCase<QType>("Flatten", {input_def}, {}, attrs, kOnnxDomain, use_contrib_qdq);
@@ -172,6 +173,7 @@ TEST_F(QnnHTPBackendTests, Flatten_QDQ8bit_Rank5) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(model_fn,
                   provider_options,
3 changes: 3 additions & 0 deletions onnxruntime/test/providers/qnn/gather_elems_op_test.cc
@@ -67,6 +67,7 @@ static void RunCPUGatherElemsOpTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
                   provider_options,
@@ -91,6 +92,7 @@ static void RunHTPQDQGatherElemsOpTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto f32_model_builder = BuildOpTestCase<float, IndexType>("GatherElements", {input_def}, {indices_def}, attrs);
   auto qdq_model_builder = BuildQDQGatherElemsTestCase<QuantType, IndexType>(input_def, indices_def, attrs,
@@ -119,6 +121,7 @@ static void RunHTPGatherElemsOpTest(const TestInputDef<DataType>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<DataType, IndexType>("GatherElements", {input_def}, {indices_def}, attrs),
                   provider_options,
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/gather_op_htp_test.cc
@@ -63,6 +63,7 @@ static void RunQDQGatherOpTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto f32_model_builder = BuildOpTestCase<float, IndicesType>("Gather", {input_def}, {indices_def}, attrs);
   auto qdq_model_builder = BuildQDQGatherTestCase<QuantType, IndicesType>(input_def, indices_def, attrs,
@@ -173,6 +174,7 @@ static void RunOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   // Runs model with a Q/DQ binary op and compares the outputs of the CPU and QNN EPs.
   RunQnnModelTest(BuildOpTestCase<InputType1, InputType2>(op_type, {input_def_1}, {input_defs_2}, attrs, op_domain),
3 changes: 3 additions & 0 deletions onnxruntime/test/providers/qnn/gemm_op_test.cc
@@ -29,6 +29,7 @@ static void RunGemmTestOnCPU(const std::vector<TestInputDef<DataType>>& input_de
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<float>("Gemm", input_defs, {}, attrs),
                   provider_options,
@@ -246,6 +247,8 @@ static void RunQDQGemmTestOnHTP(const std::vector<TestInputDef<float>>& input_de
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
+
   auto f32_model_builder = BuildOpTestCase<float>("Gemm", input_defs, {}, attrs);
   auto qdq_model_builder = BuildQDQGemmTestCase<InputAQType, InputBQType>(input_defs, attrs, use_contrib_qdq);
   TestQDQModelAccuracy<InputAQType>(f32_model_builder,
1 change: 1 addition & 0 deletions onnxruntime/test/providers/qnn/instance_norm_htp_test.cc
@@ -79,6 +79,7 @@ static void RunInstanceNormQDQTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs.
   TestQDQModelAccuracy(BuildOpTestCase<float>("InstanceNormalization", {input_def, scale_def, bias_def}, {}, attrs),
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/layer_norm_test.cc
@@ -28,6 +28,7 @@ static void RunLayerNormCpuTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
                   provider_options,
@@ -152,6 +153,7 @@ static void RunLayerNormQDQTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>("LayerNormalization", {input_def, scale_def}, {}, attrs),
                        BuildQDQLayerNormTestCase<InputQType, ScaleQType>(input_def, scale_def, bias_def, attrs,
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/leakyrelu_op_htp_test.cc
@@ -28,6 +28,7 @@ static void RunLeakyReluOpQDQTest(const TestInputDef<float>& input_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>("LeakyRelu", {input_def}, {}, attrs),
                        BuildQDQOpTestCase<QuantType>("LeakyRelu", {input_def}, {}, attrs),
@@ -66,6 +67,7 @@ TEST_F(QnnHTPBackendTests, LeakyReluFP16OpSet16) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   auto input_def = TestInputDef<float>({1, 2, 3}, false, {-40.0f, -20.0f, 1.0f, 10.0f, 30.0f, 40.0f});
   TestInputDef<MLFloat16> input_fp16_def = ConvertToFP16InputDef(input_def);
3 changes: 3 additions & 0 deletions onnxruntime/test/providers/qnn/logical_comp_ops_test.cc
@@ -73,6 +73,7 @@ static void RunCPULogicalOpTest(const std::string& op_type, const std::vector<in
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildLogicalOpTestCase(op_type, shape),
                   provider_options,
@@ -92,6 +93,7 @@ static void RunQDQLogicalOpTest(const std::string& op_type, const std::vector<in
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildQDQLogicalOpTestCase<QuantType>(op_type, shape),
                   provider_options,
@@ -157,6 +159,7 @@ TEST_F(QnnHTPBackendTests, EqualToCast4D) {
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   // Model building function that creates a QDQ graph with an Equal node followed by
   // a Cast to float32.
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/lrn_op_test.cc
@@ -70,6 +70,7 @@ static void RunCPULRNOpTest(const TestInputDef<float>& input_def, int64_t size,
   provider_options["backend_path"] = "libQnnCpu.so";
   fp32_abs_err = 1.5e-5f; // On linux we need slightly larger tolerance.
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildLRNTestCase(input_def, size, alpha, beta, bias),
                   provider_options,
@@ -91,6 +92,7 @@ static void RunQDQLRNOpTest(const TestInputDef<float>& input_def, int64_t size,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildLRNTestCase(input_def, size, alpha, beta, bias),
                        BuildQDQLRNTestCase<QuantType>(input_def, size, alpha, beta, bias),
7 changes: 5 additions & 2 deletions onnxruntime/test/providers/qnn/matmul_test.cpp
@@ -44,6 +44,7 @@ static void RunMatMulOpTest(bool is_htp_backend, const std::vector<int64_t>& sha
     provider_options["backend_path"] = "libQnnCpu.so";
 #endif
   }
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildMatMulOpTestCase(
                       TestInputDef<float>(shape_0, is_initializer_0, GetSequentialFloatData(shape_0, 0.01f, 0.02f)),
@@ -142,6 +143,7 @@ static void RunQDQMatMulOpTest(const std::vector<int64_t>& shape_0, const std::v
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestInputDef<float> input0_def(
       shape_0, is_initializer_0,
@@ -172,6 +174,7 @@ static void RunQDQPerChannelMatMulOpTest(
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   if (enable_fp16_precision) {
     provider_options["enable_htp_fp16_precision"] = "1";
@@ -290,10 +293,10 @@ TEST_F(QnnHTPBackendTests, MatMulOp_QDQ) {
   RunQDQPerChannelMatMulOpTest<uint16_t, Int4x2, uint16_t>({2, 3, 3, 3}, {3, 2}, -1, QDQTolerance(),
                                                            ExpectedEPNodeAssignment::All, 18, true);
 
-  // // UINT16, per-channel INT8 weight
+  // UINT16, per-channel INT8 weight
   RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3}, {3, 2}, 1, QDQTolerance(),
                                                            ExpectedEPNodeAssignment::All, 21, false, false);
-  RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1);
+  RunQDQPerChannelMatMulOpTest<uint16_t, int8_t, uint16_t>({2, 3, 3}, {3}, -1, QDQTolerance(0.0041f));
 }
 
 #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/max_min_op_test.cc
@@ -26,6 +26,7 @@ static void RunCPUMinOrMaxOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain),
                   provider_options,
@@ -47,6 +48,7 @@ static void RunQDQMinOrMaxOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, input_defs, {}, {}, kOnnxDomain), // baseline float32 model
                        BuildQDQOpTestCase<QType>(op_type, input_defs, {}, {}, kOnnxDomain), // QDQ model
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/pad_op_test.cpp
@@ -116,6 +116,7 @@ static void RunPadOpTest(const TestInputDef<float>& data_def,
     provider_options["backend_path"] = "libQnnCpu.so";
 #endif
   }
+  provider_options["offload_graph_io_quantization"] = "0";
 
   if (enable_fp16_precision) {
     provider_options["enable_htp_fp16_precision"] = "1";
@@ -144,6 +145,7 @@ static void RunQDQPadOpTest(const TestInputDef<float>& data_def,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildPadTestCase(data_def, pads_def, constant_value_def, attrs),
                        BuildPadQDQTestCase<QuantType>(data_def, pads_def, constant_value_def, attrs,
2 changes: 2 additions & 0 deletions onnxruntime/test/providers/qnn/pool_op_test.cpp
@@ -60,6 +60,7 @@ static void RunPoolOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnCpu.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   RunQnnModelTest(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
                   provider_options,
@@ -83,6 +84,7 @@ static void RunQDQPoolOpTest(const std::string& op_type,
 #else
   provider_options["backend_path"] = "libQnnHtp.so";
 #endif
+  provider_options["offload_graph_io_quantization"] = "0";
 
   TestQDQModelAccuracy(BuildOpTestCase<float>(op_type, {input_def}, {}, attrs),
                        BuildPoolQDQTestCase<QuantType>(op_type, input_def, attrs, use_contrib_qdq_ops),