@@ -15,6 +15,12 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
1515 : m_dynamic_load_model_weights(dynamic_load_model_weights),
1616 m_is_gpu (device.find(" GPU" ) != std::string::npos || device.find(" gpu" ) != std::string::npos),
1717 m_properties(properties) {
18+ #if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
19+ OPENVINO_ASSERT (!m_dynamic_load_model_weights,
20+ " Dynamic loading of model weights is not enabled in this build. Please set "
21+ " ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild." );
22+ #endif
23+
1824 if (m_dynamic_load_model_weights) {
1925 OPENVINO_ASSERT (m_is_gpu, " Dynamic loading of model weights is currently only supported for GPU device." );
2026 }
@@ -46,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
4652 continue ;
4753 }
4854
49- // check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
55+ // check if the file name ends with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
56+ // model
5057 if (filename.size () > 15 && filename.substr (filename.size () - 15 ) == " _preprocess.xml" ) {
5158 m_preprocess_model_path = entry.path ().string ();
5259 } else if (filename.size () > 16 && filename.substr (filename.size () - 16 ) == " _postprocess.xml" ) {
@@ -79,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
7986 " Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required." );
8087}
8188
82- void CSplittedModelInfer::load_model (const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
89+ void CSplittedModelInfer::load_model (const std::string& model_path,
90+ const ov::AnyMap& properties,
91+ const std::string& device) {
8392#if USE_FULL_MODEL
8493#else
8594 {
@@ -107,18 +116,21 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
107116 auto model = utils::singleton_core ().read_model (path);
108117 if (m_is_gpu) {
109118 if (m_dynamic_load_model_weights) {
110- properties_splitted_model[ov::weights_path.name ()] = std::filesystem::path (path).replace_extension (" .bin" ).string ();
119+ properties_splitted_model[ov::weights_path.name ()] =
120+ std::filesystem::path (path).replace_extension (" .bin" ).string ();
111121 auto cm = utils::singleton_core ().compile_model (model, m_context, properties_splitted_model);
112122 // Release model weights after compilation to save GPU memory. Load weights again in infer() when
113123 // weights are needed.
114124 cm.release_model_weights ();
115125 m_compiled_models.push_back (std::move (cm));
116126 } else {
117- m_compiled_models.push_back (utils::singleton_core ().compile_model (model, m_context, properties_splitted_model));
127+ m_compiled_models.push_back (
128+ utils::singleton_core ().compile_model (model, m_context, properties_splitted_model));
118129 m_infer_requests.push_back (m_compiled_models.back ().create_infer_request ());
119130 }
120131 } else {
121- m_compiled_models.push_back (utils::singleton_core ().compile_model (model, device, properties_splitted_model));
132+ m_compiled_models.push_back (
133+ utils::singleton_core ().compile_model (model, device, properties_splitted_model));
122134 m_infer_requests.push_back (m_compiled_models.back ().create_infer_request ());
123135 }
124136 }
@@ -149,10 +161,19 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
149161 OPENVINO_ASSERT (num_splitted_models > 1 ,
150162 " Splitted models should be at least 2, but got " + std::to_string (num_splitted_models));
151163
164+ # ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
165+ # if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
152166 std::future<bool > future_flag;
153167 if (m_dynamic_load_model_weights) {
154168 future_flag = std::move (thread_utils::load_model_weights_async (m_compiled_models[0 ]));
155169 }
170+ # else // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
171+ if (m_dynamic_load_model_weights) {
172+ PROFILE (pm, " load_model_weights" );
173+ m_compiled_models[0 ].load_model_weights ();
174+ }
175+ # endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
176+ # endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
156177
157178 // Preprocess
158179 for (const auto & input : inputs) {
@@ -186,14 +207,20 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
186207 PROFILE (pm, " splitted_model_infer_" + std::to_string (i));
187208 ov::InferRequest curInferRequest;
188209 if (m_dynamic_load_model_weights) {
210+ # ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
189211 if (i + 1 < num_splitted_models) {
190- next_future_flag =
191- thread_utils::load_model_weights_async (m_compiled_models[i + 1 ]);
212+ # if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
213+ next_future_flag = thread_utils::load_model_weights_async (m_compiled_models[i + 1 ]);
214+ # else
215+ m_compiled_models[i + 1 ].load_model_weights ();
216+ # endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
192217 }
218+ # if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
193219 if (future_flag.valid ())
194220 future_flag.wait ();
195-
221+ # endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
196222 curInferRequest = m_compiled_models[i].create_infer_request ();
223+ # endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
197224 } else {
198225 curInferRequest = m_infer_requests[i];
199226 }
@@ -208,10 +235,22 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
208235 PROFILE (pmi, " infer" );
209236 curInferRequest.infer ();
210237 }
238+
239+ # ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
211240 if (m_dynamic_load_model_weights) {
241+ # if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
212242 thread_utils::release_model_weights_async (m_compiled_models[i], std::move (curInferRequest));
243+ # else
244+ curInferRequest = ov::InferRequest (); // release infer request before releasing model weights to ensure the
245+ // model weights can be released successfully.
246+ m_compiled_models[i].release_model_weights ();
247+ # endif
213248 }
249+
250+ # if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
214251 future_flag = std::move (next_future_flag);
252+ # endif
253+ # endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
215254 }
216255
217256 GENAI_DEBUG (
0 commit comments