
Commit ea5ca55

[Paddle-TRT] Support engine sharing memory of multiple predictors (#47631)
1 parent: d969c30

File tree

4 files changed: +51 additions, -15 deletions


paddle/fluid/inference/api/analysis_config.cc

+25 -14

@@ -679,24 +679,11 @@ void AnalysisConfig::EnableTensorRtEngine(
     bool use_calib_mode) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (!use_gpu()) {
-    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
+    LOG(ERROR) << "To use TensorRT engine, please call EnableUseGpu() first";
     return;
   }
 
   use_tensorrt_ = true;
-#ifdef PADDLE_WITH_TENSORRT
-  // https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
-  // when trt version less than 7.2,
-  // createExecutionContextWithoutDeviceMemory() has bug.
-  // so, we cannot enable engine context memory sharing.
-#if IS_TRT_VERSION_GE(7200)
-  trt_engine_memory_sharing_ = true;
-#else
-  LOG(WARNING)
-      << "TensorRT engine context memory sharing needs version 7.2 and after.";
-  trt_engine_memory_sharing_ = false;
-#endif
-#endif
   tensorrt_workspace_size_ = workspace_size;
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
@@ -711,6 +698,30 @@ void AnalysisConfig::EnableTensorRtEngine(
 #endif
 }
 
+void AnalysisConfig::EnableTensorRTMemoryOptim(bool engine_memory_sharing,
+                                               int sharing_identifier) {
+  PADDLE_ENFORCE_EQ(
+      use_tensorrt_,
+      true,
+      platform::errors::InvalidArgument(
+          "To enable TensorRT memory optim, please call "
+          "EnableTensorRtEngine or enable_tensorrt_engine first."));
+  PADDLE_ENFORCE_GE(sharing_identifier,
+                    0,
+                    platform::errors::InvalidArgument(
+                        "The value of sharing_identifier must be greater "
+                        "than or equal to 0."));
+  if (!engine_memory_sharing) {
+    PADDLE_ENFORCE_EQ(sharing_identifier,
+                      0,
+                      platform::errors::InvalidArgument(
+                          "The value of sharing_identifier must be equal to 0 "
+                          "when engine_memory_sharing is false."));
+  }
+  trt_engine_memory_sharing_ = engine_memory_sharing;
+  trt_engine_memory_sharing_identifier_ = sharing_identifier;
+}
+
 void AnalysisConfig::EnableDlnne(
     int min_subgraph_size,
     int max_batch_size,
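The new setter is deliberately order- and value-checked: EnableTensorRtEngine (or enable_tensorrt_engine from Python) must be called first, the identifier must be non-negative, and a non-zero identifier is only legal when sharing is actually enabled. A minimal sketch of a valid call sequence, assuming the usual paddle::AnalysisConfig entry points (the GPU memory, workspace, and batch values are placeholders, not part of this commit):

#include "paddle_inference_api.h"

paddle::AnalysisConfig config;
config.EnableUseGpu(100 /*initial GPU memory pool, MB*/, 0 /*device id*/);
// Must precede EnableTensorRTMemoryOptim, otherwise the PADDLE_ENFORCE_EQ above fires.
config.EnableTensorRtEngine(1 << 28 /*workspace_size*/,
                            1 /*max_batch_size*/,
                            3 /*min_subgraph_size*/);
// Identifier 1 marks this predictor as a member of sharing group 1.
// EnableTensorRTMemoryOptim(false, 1) or a negative identifier would be rejected.
config.EnableTensorRTMemoryOptim(true, 1);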

paddle/fluid/inference/api/analysis_predictor.h

+6 -1

@@ -103,7 +103,12 @@ class AnalysisPredictor : public PaddlePredictor {
     if (config_.shape_range_info_collected()) {
       config_.SwitchIrOptim(false);
     }
-    predictor_id_ = inference::GetUniqueId();
+    auto trt_identifier = config_.trt_engine_memory_sharing_identifier_;
+    if (trt_identifier > 0) {
+      predictor_id_ = -trt_identifier;
+    } else {
+      predictor_id_ = inference::GetUniqueId();
+    }
   }
   ///
   /// \brief Destroy the Analysis Predictor object
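This constructor change is what actually ties predictors together: a positive sharing identifier is negated and used as the predictor id, so every predictor configured with the same identifier resolves to the same (negative) id and can reuse one TensorRT engine memory pool, while the default path keeps handing out unique positive ids. A standalone sketch of that selection logic (illustrative only, not Paddle code; the names are made up):

// Mirrors the branch added in the diff above.
int SelectPredictorId(int trt_sharing_identifier, int next_unique_id) {
  // A positive identifier maps to a stable negative id shared by every
  // predictor configured with that identifier; otherwise fall back to the
  // usual unique, positive id.
  return trt_sharing_identifier > 0 ? -trt_sharing_identifier : next_unique_id;
}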

paddle/fluid/inference/api/paddle_analysis_config.h

+15 -0

@@ -576,6 +576,20 @@ struct PD_INFER_DECL AnalysisConfig {
   ///
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
   ///
+  /// \brief Turn on the TensorRT memory optimization.
+  ///
+  /// \param engine_memory_sharing Whether to enable TensorRT memory
+  /// optimization.
+  /// \param sharing_identifier This parameter can be set if TensorRT memory
+  /// optimization is enabled, and the value must be greater than 0. If you have
+  /// multiple predictors that want to share memory, you can specify the
+  /// same value for these predictors. NOTE: The predictors specified with the
+  /// same value must be guaranteed to be executed serially, otherwise undefined
+  /// behavior will occur.
+  ///
+  void EnableTensorRTMemoryOptim(bool engine_memory_sharing = true,
+                                 int sharing_identifier = 0);
+  ///
   /// \brief A boolean state telling whether the tensorrt engine memory sharing
   /// is activated.
   ///
@@ -1093,6 +1107,7 @@ struct PD_INFER_DECL AnalysisConfig {
   // memory reuse related.
   bool enable_memory_optim_{false};
   bool trt_engine_memory_sharing_{false};
+  int trt_engine_memory_sharing_identifier_{0};
 
   bool use_mkldnn_{false};
   std::unordered_set<std::string> mkldnn_enabled_op_types_;
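Taken together with the constructor change above, this is how two predictors end up sharing one engine memory pool. A hedged end-to-end sketch (paddle_infer::Config and paddle_infer::CreatePredictor are the public aliases over AnalysisConfig and the predictor factory; model paths and sizes are placeholders, and input feeding is elided):

#include <memory>
#include <string>
#include "paddle_inference_api.h"

paddle_infer::Config MakeConfig(const std::string& model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir);                 // placeholder model path
  config.EnableUseGpu(100, 0);                // 100 MB initial pool, GPU 0
  config.EnableTensorRtEngine();              // required before memory optim
  config.EnableTensorRTMemoryOptim(true, 1);  // same identifier => shared engine memory
  return config;
}

int main() {
  auto config_a = MakeConfig("./model_a");
  auto config_b = MakeConfig("./model_b");
  auto predictor_a = paddle_infer::CreatePredictor(config_a);
  auto predictor_b = paddle_infer::CreatePredictor(config_b);
  // Feed inputs via GetInputHandle(...) before Run() in a real program.
  // NOTE: predictors sharing an identifier must run serially, never concurrently.
  predictor_a->Run();
  predictor_b->Run();
  return 0;
}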

paddle/fluid/pybind/inference_api.cc

+5 -0

@@ -32,6 +32,7 @@
 
 #include "paddle/fluid/inference/api/analysis_predictor.h"
 #include "paddle/fluid/inference/api/helper.h"
+#include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_infer_contrib.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
@@ -732,6 +733,10 @@ void BindAnalysisConfig(py::module *m) {
           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
           py::arg("use_static") = false,
           py::arg("use_calib_mode") = true)
+      .def("enable_tensorrt_memory_optim",
+           &AnalysisConfig::EnableTensorRTMemoryOptim,
+           py::arg("engine_memory_sharing") = true,
+           py::arg("sharing_identifier") = 0)
      .def("tensorrt_precision_mode", &AnalysisConfig::tensorrt_precision_mode)
      .def("set_trt_dynamic_shape_info",
           &AnalysisConfig::SetTRTDynamicShapeInfo,
