Skip to content

Commit 8e5b8b6

Browse files
quic-ashigarg (Ashish Garg, AISW)
authored and committed
Set shared memory type based on options during the compilation phase (microsoft#24196)
### Description

During inference, using the QNN EP option to set enable_htp_shared_memory_allocator gives a hint that we use RPC-allocated buffers to avoid buffer copies between the CPU and the NPU. With the current PR, we add hints in the compilation phase so that, if RPC memory is going to be used, any additional allocations done on the CPU can be avoided.

### Motivation and Context

This should help reduce the peak CPU memory consumption while running AI workloads using shared memory. Related PR: microsoft#23136

Co-authored-by: Ashish Garg (AISW) <ashigarg@qti.qualcomm.com>
1 parent 54f07b5 commit 8e5b8b6

File tree

5 files changed

+13
-7
lines changed

5 files changed

+13
-7
lines changed

onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,10 @@ Status BaseOpBuilder::ProcessOutputs(QnnModelWrapper& qnn_model_wrapper,
204204
std::string output_name;
205205
};
206206
std::vector<CastNodeInfo> cast_node_info_vec;
207-
207+
auto mem_type = QNN_TENSORMEMTYPE_RAW;
208+
if (true == qnn_model_wrapper.GetModelSettings().htp_shared_memory) {
209+
mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
210+
}
208211
const auto output_count = GetOutputCountQnnRequired(node_unit);
209212
for (size_t output_i = 0; output_i < output_count; ++output_i) {
210213
const auto& output_name = outputs[output_i].node_arg.Name();
@@ -255,7 +258,8 @@ Status BaseOpBuilder::ProcessOutputs(QnnModelWrapper& qnn_model_wrapper,
255258
QNN_TENSOR_TYPE_NATIVE,
256259
supported_qnn_data_type,
257260
output_info.quant_param.Copy(),
258-
std::move(cast_output_shape));
261+
std::move(cast_output_shape), {},
262+
mem_type);
259263
ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(cast_input_tensorwrapper)), "Failed to add tensor.");
260264
output_names.push_back(cast_input_name);
261265
cast_node_info_vec.push_back({cast_node_name, cast_input_name, output_name});

onnxruntime/core/providers/qnn/builder/qnn_def.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,6 @@ class QnnTensorWrapper {
188188
SetQnnTensorClientBuf(qnn_tensor_, client_buf_);
189189
}
190190

191-
if (mem_type != QNN_TENSORMEMTYPE_RAW) {
192-
ORT_THROW("mem_type not supported for now.");
193-
}
194-
195191
SetQnnTensorQParams(qnn_tensor_, quant_params_.Get());
196192
}
197193

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,13 @@ Status QnnModelWrapper::MakeTensorWrapper(const NodeUnitIODef& tensor, QnnTensor
6868
ORT_RETURN_IF_ERROR(UnpackInitializerData(*tensor_info.initializer_tensor, unpacked_tensor));
6969
}
7070

71+
Qnn_TensorMemType_t mem_type = QNN_TENSORMEMTYPE_RAW;
72+
if (true == model_settings_.htp_shared_memory) {
73+
mem_type = QNN_TENSORMEMTYPE_MEMHANDLE;
74+
}
7175
tensor_wrapper = QnnTensorWrapper(tensor_name, GetTensorType(tensor_name), tensor_info.qnn_data_type,
7276
std::move(tensor_info.quant_param), std::move(tensor_info.shape),
73-
std::move(unpacked_tensor));
77+
std::move(unpacked_tensor), mem_type);
7478
return Status::OK();
7579
}
7680

onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ struct TensorInfo {
3030

3131
struct ModelSettings {
3232
bool offload_graph_io_quantization = false;
33+
bool htp_shared_memory = false;
3334
};
3435

3536
class QnnModelWrapper {

onnxruntime/core/providers/qnn/qnn_execution_provider.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -431,6 +431,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
431431
// Initialize rpcmem_library_.
432432
// This is necessary for HtpSharedMemoryAllocator to function and also indicates that the allocator is available.
433433
rpcmem_library_ = std::make_shared<qnn::RpcMemLibrary>();
434+
model_settings_.htp_shared_memory = true;
434435
}
435436

436437
dump_json_qnn_graph_ = ParseBoolOption("dump_json_qnn_graph", false, provider_options_map);

0 commit comments

Comments (0)