Commit 9d0dc9f

Enable QNN EP weight sharing generation using public API (#23702)
### Description

Enable QNN EP weight sharing generation using the public API instead of internal interfaces, so that users can integrate it into their own toolchains. The change shares the QnnBackendManager across ORT sessions when ep.share_ep_contexts is enabled, and adds an extra option to end the sharing so that we know when to remove the shared QnnBackendManager from the singleton. The tool is also renamed from onnxruntime_qnn_ctx_gen to ep_weight_sharing_ctx_gen, so that it can be reused for other EPs.
1 parent 813bdaa commit 9d0dc9f

15 files changed: +519 -377 lines
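For reference, a minimal sketch of the public-API flow this change enables: two sessions compile two models into one shared QNN context binary. The session-option keys are the ones added/used in this PR; the model paths, the backend_path value, and the loop framing are illustrative assumptions, not part of the commit.

```cpp
// Minimal sketch: compile two models into one shared QNN context binary.
#include <onnxruntime_cxx_api.h>

#include <string>
#include <unordered_map>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "ep_weight_sharing");
  const ORTCHAR_T* models[] = {ORT_TSTR("model1.onnx"), ORT_TSTR("model2.onnx")};

  for (int i = 0; i < 2; ++i) {
    Ort::SessionOptions so;
    so.AddConfigEntry("ep.context_enable", "1");      // dump the EPContext model
    so.AddConfigEntry("ep.context_embed_mode", "0");  // keep the context binary in a separate file
    so.AddConfigEntry("ep.share_ep_contexts", "1");   // share the QnnBackendManager across sessions
    if (i == 1) {
      // Last model: end the sharing so the shared QnnBackendManager gets released.
      so.AddConfigEntry("ep.stop_share_ep_contexts", "1");
    }
    std::unordered_map<std::string, std::string> qnn_options{
        {"backend_path", "QnnHtp.dll"}};  // HTP backend; library name is platform specific
    so.AppendExecutionProvider("QNN", qnn_options);
    Ort::Session session(env, models[i], so);  // compilation happens at session creation
  }
  return 0;
}
```

In this sketch the first session creates the QnnBackendManager and publishes it; the second session reuses it and, because ep.stop_share_ep_contexts is set, detaches it from the singleton when done.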

cmake/onnxruntime_python.cmake (+1 -1)

@@ -1029,7 +1029,7 @@ if (onnxruntime_USE_QNN)
   add_custom_command(
     TARGET onnxruntime_pybind11_state POST_BUILD
     COMMAND ${CMAKE_COMMAND} -E copy
-    $<TARGET_FILE:onnxruntime_qnn_ctx_gen>
+    $<TARGET_FILE:ep_weight_sharing_ctx_gen>
     $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
   )
   if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")

cmake/onnxruntime_unittests.cmake (+18 -15)

@@ -1289,31 +1289,34 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
 
   if(onnxruntime_USE_QNN)
     #qnn ctx generator
-    set(onnxruntime_qnn_ctx_gen_src_dir ${TEST_SRC_DIR}/qnn_ctx_gen)
-    set(onnxruntime_qnn_ctx_gen_src_patterns
-    "${onnxruntime_qnn_ctx_gen_src_dir}/*.cc"
-    "${onnxruntime_qnn_ctx_gen_src_dir}/*.h")
+    set(ep_weight_sharing_ctx_gen_src_dir ${TEST_SRC_DIR}/ep_weight_sharing_ctx_gen)
+    set(ep_weight_sharing_ctx_gen_src_patterns
+    "${ep_weight_sharing_ctx_gen_src_dir}/*.cc"
+    "${ep_weight_sharing_ctx_gen_src_dir}/*.h")
 
-    file(GLOB onnxruntime_qnn_ctx_gen_src CONFIGURE_DEPENDS
-      ${onnxruntime_qnn_ctx_gen_src_patterns}
+    file(GLOB ep_weight_sharing_ctx_gen_src CONFIGURE_DEPENDS
+      ${ep_weight_sharing_ctx_gen_src_patterns}
     )
-    onnxruntime_add_executable(onnxruntime_qnn_ctx_gen ${onnxruntime_qnn_ctx_gen_src})
-    target_include_directories(onnxruntime_qnn_ctx_gen PRIVATE ${onnx_test_runner_src_dir} ${ONNXRUNTIME_ROOT}
-                               ${onnxruntime_graph_header} ${onnxruntime_exec_src_dir}
-                               ${CMAKE_CURRENT_BINARY_DIR})
+    onnxruntime_add_executable(ep_weight_sharing_ctx_gen ${ep_weight_sharing_ctx_gen_src})
+    target_include_directories(ep_weight_sharing_ctx_gen PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR})
     if (WIN32)
-      target_compile_options(onnxruntime_qnn_ctx_gen PRIVATE ${disabled_warnings})
+      target_compile_options(ep_weight_sharing_ctx_gen PRIVATE ${disabled_warnings})
       if (NOT DEFINED SYS_PATH_LIB)
        set(SYS_PATH_LIB shlwapi)
       endif()
     endif()
 
-    if(WIN32)
-      target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE debug dbghelp advapi32)
+    if (onnxruntime_BUILD_SHARED_LIB)
+      set(ep_weight_sharing_ctx_gen_libs onnxruntime_common onnxruntime ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
+      target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE ${ep_weight_sharing_ctx_gen_libs})
+      if (WIN32)
+        target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE debug dbghelp advapi32)
+      endif()
+    else()
+      target_link_libraries(ep_weight_sharing_ctx_gen PRIVATE onnxruntime_session ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE})
     endif()
-    target_link_libraries(onnxruntime_qnn_ctx_gen PRIVATE onnx_test_runner_common onnxruntime_test_utils onnxruntime_common onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers onnx_test_data_proto ${onnxruntime_test_providers_libs} ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS})
 
-    set_target_properties(onnxruntime_qnn_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
+    set_target_properties(ep_weight_sharing_ctx_gen PROPERTIES FOLDER "ONNXRuntimeTest")
   endif()
 
   # shared lib

include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (+4 -1)

@@ -315,9 +315,12 @@ static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed
 // in case user need to merge/connect multiple EPContext nodes in one model
 static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix";
 
-// Share EP related resources across EPs
+// Share EP related resources across sessions
 static const char* const kOrtSessionOptionShareEpContexts = "ep.share_ep_contexts";
 
+// Stop to share EP related resources across sessions from then on
+static const char* const kOrtSessionOptionStopShareEpContexts = "ep.stop_share_ep_contexts";
+
 // Use this config when dumping EP context model with an external initializers file
 // All initializers will be inside the external data file if specified, otherwise all in Onnx file
 static const char* const kOrtSessionOptionsEpContextModelExternalInitializersFileName =
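Callers that include this header can pass the new constant straight to the session-options config API. A small hedged sketch follows; the ConfigureEpContextSharing helper is made up for illustration and is not part of the PR.

```cpp
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_session_options_config_keys.h>

// Illustrative helper: turn on sharing, and on the final session also signal
// that sharing should stop so the shared resources can be released.
void ConfigureEpContextSharing(Ort::SessionOptions& so, bool is_last_session) {
  so.AddConfigEntry(kOrtSessionOptionShareEpContexts, "1");
  if (is_last_session) {
    so.AddConfigEntry(kOrtSessionOptionStopShareEpContexts, "1");
  }
}
```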

onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc (+2 -0)

@@ -470,8 +470,10 @@ Status QnnBackendManager::InitializeProfiling() {
   QnnProfile_Level_t qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
   if (ProfilingLevel::BASIC == profiling_level_merge_) {
     qnn_profile_level = QNN_PROFILE_LEVEL_BASIC;
+    LOGS_DEFAULT(VERBOSE) << "Profiling level set to basic.";
   } else if (ProfilingLevel::DETAILED == profiling_level_merge_) {
     qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED;
+    LOGS_DEFAULT(VERBOSE) << "Profiling level set to detailed.";
   }
   Qnn_ErrorHandle_t result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_);
   ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile! Error: ", QnnErrorHandleToString(result));

onnxruntime/core/providers/qnn/qnn_execution_provider.cc (+31 -11)

@@ -195,6 +195,10 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     share_ep_contexts_ =
         config_options->GetConfigOrDefault(kOrtSessionOptionShareEpContexts, "0") == "1";
     LOGS_DEFAULT(VERBOSE) << "User specified option - share EP contexts across sessions: " << share_ep_contexts_;
+
+    stop_share_ep_contexts_ =
+        config_options->GetConfigOrDefault(kOrtSessionOptionStopShareEpContexts, "0") == "1";
+    LOGS_DEFAULT(VERBOSE) << "User specified option - stop share EP contexts across sessions: " << stop_share_ep_contexts_;
   }
 
   static const std::string BACKEND_PATH = "backend_path";

@@ -384,17 +388,27 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     }
   }
 
-  qnn_backend_manager_ = qnn::QnnBackendManager::Create(
-      qnn::QnnBackendManagerConfig{backend_path,
-                                   profiling_level_etw,
-                                   profiling_level,
-                                   profiling_file_path,
-                                   context_priority,
-                                   qnn_saver_path,
-                                   device_id_,
-                                   htp_arch,
-                                   soc_model,
-                                   enable_htp_weight_sharing});
+  // For context binary generation with weight sharing enabled, use the QnnBackendManager from the shared context if it exists,
+  // so that all graphs from later sessions will be compiled into the same QNN context.
+  if (context_cache_enabled_ && share_ep_contexts_ && SharedContext::GetInstance().GetSharedQnnBackendManager()) {
+    qnn_backend_manager_ = SharedContext::GetInstance().GetSharedQnnBackendManager();
+    // Clear the QnnBackendManager from the singleton to stop the resource sharing
+    if (stop_share_ep_contexts_) {
+      SharedContext::GetInstance().ResetSharedQnnBackendManager();
+    }
+  } else {
+    qnn_backend_manager_ = qnn::QnnBackendManager::Create(
+        qnn::QnnBackendManagerConfig{backend_path,
+                                     profiling_level_etw,
+                                     profiling_level,
+                                     profiling_file_path,
+                                     context_priority,
+                                     qnn_saver_path,
+                                     device_id_,
+                                     htp_arch,
+                                     soc_model,
+                                     enable_htp_weight_sharing});
+  }
 
 #if defined(_WIN32)
   if (onnxruntime::logging::EtwRegistrationManager::SupportsETW()) {

@@ -1037,6 +1051,12 @@ Status QNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused
                                            qnn_context_embed_mode_,
                                            max_spill_fill_buffer_size,
                                            logger));
+
+    if (share_ep_contexts_ && !stop_share_ep_contexts_ &&
+        nullptr == SharedContext::GetInstance().GetSharedQnnBackendManager()) {
+      ORT_RETURN_IF_NOT(SharedContext::GetInstance().SetSharedQnnBackendManager(qnn_backend_manager_),
+                        "Failed to set shared QnnBackendManager.");
+    }
   }
   return Status::OK();
 }
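Condensed, these hunks give each session the lifecycle below. The sketch restates the logic with stand-in types so it compiles in isolation; BackendManager, g_shared, AcquireManager, and PublishIfFirst are illustrative names, not the PR's actual signatures.

```cpp
#include <memory>

struct BackendManager {};                  // stand-in for qnn::QnnBackendManager
std::shared_ptr<BackendManager> g_shared;  // models SharedContext's shared slot

// Constructor-time: with context generation and sharing on, adopt the shared
// manager so later graphs compile into the same QNN context.
std::shared_ptr<BackendManager> AcquireManager(bool ctx_enabled, bool share, bool stop) {
  if (ctx_enabled && share && g_shared) {
    auto manager = g_shared;
    if (stop) g_shared.reset();  // the last session detaches the singleton
    return manager;
  }
  return std::make_shared<BackendManager>();  // first (or non-sharing) session
}

// Compile-time: the first sharing session publishes its manager for later sessions.
void PublishIfFirst(const std::shared_ptr<BackendManager>& m, bool share, bool stop) {
  if (share && !stop && g_shared == nullptr) g_shared = m;
}
```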

onnxruntime/core/providers/qnn/qnn_execution_provider.h (+1 -0)

@@ -90,6 +90,7 @@ class QNNExecutionProvider : public IExecutionProvider {
   uint32_t default_rpc_control_latency_ = 0;
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
+  bool stop_share_ep_contexts_ = false;
   bool enable_spill_fill_buffer_ = false;
 #if defined(_WIN32)
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_ = nullptr;

onnxruntime/core/providers/qnn/shared_context.h (+26 -0)

@@ -61,13 +61,39 @@ class SharedContext {
     return graph_exist;
   }
 
+  bool SetSharedQnnBackendManager(std::shared_ptr<qnn::QnnBackendManager>& qnn_backend_manager) {
+    const std::lock_guard<std::mutex> lock(mtx_);
+
+    if (qnn_backend_manager_ != nullptr) {
+      if (qnn_backend_manager_ == qnn_backend_manager) {
+        return true;
+      }
+      return false;
+    }
+    qnn_backend_manager_ = qnn_backend_manager;
+    return true;
+  }
+
+  std::shared_ptr<qnn::QnnBackendManager> GetSharedQnnBackendManager() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    return qnn_backend_manager_;
+  }
+
+  void ResetSharedQnnBackendManager() {
+    const std::lock_guard<std::mutex> lock(mtx_);
+    qnn_backend_manager_.reset();
+  }
+
  private:
   SharedContext() = default;
   ~SharedContext() = default;
 
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(SharedContext);
 
+  // Used for passing through QNN models (deserialized from context binary) across sessions
   std::vector<std::unique_ptr<qnn::QnnModel>> shared_qnn_models_;
+  // Used for compiling multiple models into the same QNN context binary
+  std::shared_ptr<qnn::QnnBackendManager> qnn_backend_manager_;
   // Producer sessions can be in parallel
   // Consumer sessions have to be after producer sessions initialized
   std::mutex mtx_;
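SetSharedQnnBackendManager is first-setter-wins: it succeeds when the slot is empty or when the same manager is set again, and rejects a different one. A self-contained sketch of that contract with a stand-in payload type; SharedSlot and Payload are illustrative, not from the PR.

```cpp
#include <cassert>
#include <memory>
#include <mutex>

struct Payload {};  // stand-in for qnn::QnnBackendManager

class SharedSlot {
 public:
  bool Set(const std::shared_ptr<Payload>& p) {
    std::lock_guard<std::mutex> lock(mtx_);
    if (payload_ != nullptr) return payload_ == p;  // idempotent for the same pointer
    payload_ = p;
    return true;
  }
  std::shared_ptr<Payload> Get() {
    std::lock_guard<std::mutex> lock(mtx_);
    return payload_;
  }
  void Reset() {
    std::lock_guard<std::mutex> lock(mtx_);
    payload_.reset();
  }

 private:
  std::shared_ptr<Payload> payload_;
  std::mutex mtx_;
};

int main() {
  SharedSlot slot;
  auto a = std::make_shared<Payload>();
  auto b = std::make_shared<Payload>();
  bool first = slot.Set(a);  // true: empty slot, first setter wins
  bool again = slot.Set(a);  // true: same payload is idempotent
  bool other = slot.Set(b);  // false: a different payload is rejected
  assert(first && again && !other);
  slot.Reset();              // stop sharing: the slot is free again
  bool after_reset = slot.Set(b);
  assert(after_reset);
  return 0;
}
```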

onnxruntime/test/qnn_ctx_gen/README.md → onnxruntime/test/ep_weight_sharing_ctx_gen/README.md (+6 -4)

@@ -2,17 +2,19 @@
 
 This tool provides the way to generate Onnx models that wrap a QNN context binary with weight sharing enabled. The options to use with the tool are listed below:
 
-`onnxruntime_qnn_ctx_gen [options...] model_path,model_path`
+`ep_weight_sharing_ctx_gen [options...] model_1_path,model_2_path`
 
-./onnxruntime_qnn_ctx_gen -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" -C "ep.context_enable|1 ep.context_embed_mode|0" /mnt/c/model1.onnx,/mnt/c/model2.onnx
+./ep_weight_sharing_ctx_gen -e qnn -v -i "soc_model|60 htp_graph_finalization_optimization_mode|3" /mnt/c/model1.onnx,/mnt/c/model2.onnx
 
 Options:
-
+
+    -e [qnn|tensorrt|openvino|vitisai]: Specifies the compile-based provider: qnn, tensorrt, openvino, vitisai. Default is qnn.
+
     -v: Show verbose information.
 
     -C: [session_config_entries]: Specify session configuration entries as key-value pairs: -C "<key1>|<val1> <key2>|<val2>"
         Refer to onnxruntime_session_options_config_keys.h for valid keys and values.
-        [Example] -C "ep.context_enable|1 ep.context_embed_mode|0"
+        [Example] -C "ep.context_enable|1 ep.context_embed_mode|0". These are set by default, so they can be omitted.
 
     -i: [provider_options]: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are:
         [Usage]: -i '<key1>|<value1> <key2>|<value2>'

onnxruntime/test/qnn_ctx_gen/command_args_parser.cc → onnxruntime/test/ep_weight_sharing_ctx_gen/command_args_parser.cc (+31 -16)

@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// Copyright (c) 2023 NVIDIA Corporation.
 // Licensed under the MIT License.
 
 #include "command_args_parser.h"

@@ -29,28 +28,30 @@ namespace qnnctxgen {
 
 /*static*/ void CommandLineParser::ShowUsage() {
   printf(
-      "onnxruntime_qnn_ctx_gen [options...] model1_path,model2_path\n"
-      "Example: ./onnxruntime_qnn_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
+      "ep_weight_sharing_ctx_gen [options...] model1_path,model2_path\n"
+      "Example: ./ep_weight_sharing_ctx_gen -i \"soc_model|60 htp_graph_finalization_optimization_mode|3\" -C \"ep.context_node_name_prefix|_part1\" ./model1.onnx,./model2.onnx\n"
       "Options:\n"
+      "\t-e [qnn|tensorrt|openvino|vitisai]: Specifies the compile based provider 'qnn','tensorrt','openvino', 'vitisai'. "
+      "Default:'qnn'.\n"
       "\t-v: Show verbose information.\n"
       "\t-C: Specify session configuration entries as key-value pairs: -C \"<key1>|<value1> <key2>|<value2>\" \n"
      "\t    Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n"
      "\t    Force ep.context_enable to 1 and ep.context_embed_mode to 0. Change ep.context_file_path is not allowed."
      "\t    [Example] -C \"ep.context_node_name_prefix|_part1\" \n"
-      "\t-i: Specify QNN EP specific runtime options as key value pairs. Different runtime options available are: \n"
+      "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
      "\t    [Usage]: -i '<key1>|<value1> <key2>|<value2>'\n"
       "\n"
-      "\t    [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n"
-      "\t    [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
-      "\t    [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n"
-      "\t    [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
-      "\t    [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
-      "\t    [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
+      "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. default to HTP backend\n"
+      "\t    [QNN only] [vtcm_mb]: QNN VTCM size in MB. default to 0(not set).\n"
+      "\t    [QNN only] [htp_graph_finalization_optimization_mode]: QNN graph finalization optimization mode, options: '0', '1', '2', '3', default is '0'.\n"
+      "\t    [QNN only] [soc_model]: The SoC Model number. Refer to QNN SDK documentation for specific values. Defaults to '0' (unknown). \n"
+      "\t    [QNN only] [htp_arch]: The minimum HTP architecture. The driver will use ops compatible with this architecture. eg: '0', '68', '69', '73', '75'. Defaults to '0' (none). \n"
+      "\t    [QNN only] [enable_htp_fp16_precision]: Enable the HTP_FP16 precision so that the float32 model will be inferenced with fp16 precision. \n"
       "\t    Otherwise, it will be fp32 precision. Works for float32 model for HTP backend. Defaults to '1' (with FP16 precision.). \n"
-      "\t    [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
-      "\t    [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
-      "\t    Defaults to '0' (QNN EP handles the graph I/O quantization and dequantization). \n"
-      "\t    [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary."
+      "\t    [QNN only] [enable_htp_weight_sharing]: Allows common weights across graphs to be shared and stored in a single context binary. Defaults to '1' (enabled).\n"
+      "\t    [QNN only] [offload_graph_io_quantization]: Offload graph input quantization and graph output dequantization to another EP (typically CPU EP). \n"
+      "\t    Defaults to '1' (QNN EP handles the graph I/O quantization and dequantization). \n"
+      "\t    [QNN only] [enable_htp_spill_fill_buffer]: Enable HTP spill file buffer, used while generating QNN context binary."
       "\t    [Example] -i \"vtcm_mb|8 htp_arch|73\" \n"
       "\n"
       "\t-h: help\n");

@@ -109,8 +110,22 @@ static bool ParseSessionConfigs(const std::string& configs_string,
 
 /*static*/ bool CommandLineParser::ParseArguments(TestConfig& test_config, int argc, ORTCHAR_T* argv[]) {
   int ch;
-  while ((ch = getopt(argc, argv, ORT_TSTR("o:u:i:C:vh"))) != -1) {
+  while ((ch = getopt(argc, argv, ORT_TSTR("e:o:u:i:C:vh"))) != -1) {
     switch (ch) {
+      case 'e':
+        if (!CompareCString(optarg, ORT_TSTR("qnn"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kQnnExecutionProvider;
+        } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider;
+        } else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider;
+        } else if (!CompareCString(optarg, ORT_TSTR("vitisai"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider;
+        } else {
+          fprintf(stderr, "The execution provider is not included in this tool.\n");
+          return false;
+        }
+        break;
       case 'v':
         test_config.run_config.f_verbose = true;
         break;

@@ -162,7 +177,7 @@ static bool ParseSessionConfigs(const std::string& configs_string,
             'offload_graph_io_quantization', 'enable_htp_spill_fill_buffer'])");
       }
 
-      test_config.run_config.qnn_options[key] = value;
+      test_config.run_config.provider_options[key] = value;
     }
     break;
   }
