Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c0048a
remote context.
xipingyan Feb 6, 2026
3662cf6
Enable remote tensor.
xipingyan Feb 6, 2026
a10c31a
use genai_debug to print log.
xipingyan Feb 6, 2026
940531b
draft
xipingyan Feb 6, 2026
307c8e1
disable multiple thread load/release weight.
xipingyan Feb 6, 2026
234d0a5
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 6, 2026
5ab6d13
add macro, easy to debug.
xipingyan Feb 6, 2026
78d56a9
Disable dynamic load weights by default; performance is low.
xipingyan Feb 6, 2026
28f624c
Apply suggestion from @Copilot
xipingyan Feb 6, 2026
7266143
enable dynamic load model weights unit test.
xipingyan Feb 7, 2026
d45d2d2
skipped, only for dynamic load weights.
xipingyan Feb 7, 2026
bc53caf
dynamic load model weights: do not create infer request.
xipingyan Feb 7, 2026
de987d9
Apply suggestion from @Copilot
xipingyan Feb 12, 2026
2e3450f
Apply suggestions from code review
xipingyan Feb 12, 2026
8d6154d
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 25, 2026
2b8db2d
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
xipingyan Feb 26, 2026
1adbc6c
fix build issue.
xipingyan Feb 26, 2026
daf16dc
Update src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_mod…
xipingyan Feb 26, 2026
3aad6b6
rename to release_fun
xipingyan Feb 26, 2026
4b3a6be
Fix dangling reference in async lambda captures in thread_helper.hpp …
Copilot Feb 26, 2026
1296c2c
-#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
xipingyan Feb 26, 2026
0d416d5
Fix GPU device skip condition in DenoiserLoopModule test to use subst…
Copilot Feb 26, 2026
3a25241
Update src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
xipingyan Feb 26, 2026
422d914
remove duplicated: check_cache_dir
xipingyan Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pipeline_modules:
cache_dir: ./cache_dir_denoiser_loop/
model_path: tests/module_genai/cpp/test_models/Wan2.1-T2V-1.3B-Diffusers
splitted_model: true
dynamic_load_weights: true
type: DenoiserLoopModule
latent_image:
device: CPU
Expand Down
18 changes: 18 additions & 0 deletions src/cpp/src/module_genai/module_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,24 @@ void IBaseModule::check_splitted_model() {
}
}

// Reads the optional module parameter `param_name` and interprets it as a boolean.
// - Missing/empty value            -> returns `default_value`.
// - "true"/"1" (any letter case)   -> returns true  (logged at INFO level).
// - "false"/"0" (any letter case)  -> returns false (logged at INFO level).
// - Anything else                  -> logs an error and returns `default_value`.
// Note: `default_value` stays `const bool&` to match the declaration in module_base.hpp.
bool IBaseModule::check_bool_param(const std::string& param_name, const bool& default_value) {
    auto p = get_optional_param(param_name);
    if (p.empty()) {
        return default_value;
    }

    // Normalize ASCII letters to lowercase so every capitalization is accepted
    // ("true", "True", "TRUE", "tRuE", ...), not just the three spellings that
    // were previously special-cased. No <algorithm>/<cctype> needed.
    std::string lowered = p;
    for (char& c : lowered) {
        if (c >= 'A' && c <= 'Z') {
            c = static_cast<char>(c - 'A' + 'a');
        }
    }

    if (lowered == "true" || lowered == "1") {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = true");
        return true;
    }
    if (lowered == "false" || lowered == "0") {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = false");
        return false;
    }
    GENAI_ERR("Module[" + module_desc->name + "]: Invalid bool param value for '" + param_name + "': " + p +
              ", use default value: " + (default_value ? "true" : "false"));
    return default_value;
}

} // namespace module
} // namespace genai
} // namespace ov
2 changes: 2 additions & 0 deletions src/cpp/src/module_genai/module_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class IBaseModule {
bool m_splitted_model = false;
void check_splitted_model();

bool check_bool_param(const std::string& param_name, const bool& default_value);

// Initialize ov::Model from config models_map with param_name: "ov_model"
void init_ov_model();
};
Expand Down
7 changes: 5 additions & 2 deletions src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ void DenoiserLoopModule::print_static_config() {
type: "OVTensor" # Support DataType: [OVTensor]
params:
model_path: "model"
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string.
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string. But `splitted_model` and `dynamic_load_weights` depend on it.
dynamic_load_weights: "bool value" # [Optional], default false. Whether to dynamically load/release model weights during inference to save GPU memory.
)" << std::endl;
}

Expand Down Expand Up @@ -82,6 +83,8 @@ bool DenoiserLoopModule::initialize() {

check_splitted_model();

m_dynamic_load_weights = check_bool_param("dynamic_load_weights", false);

std::filesystem::path model_path = module_desc->get_full_path(it_path->second);
auto transformer_model_path = model_path / "transformer/openvino_model.xml";
if (m_model_type == DiffusionModelType::ZIMAGE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <regex>
#include "logger.hpp"
#include "module_genai/utils/thread_helper.hpp"

namespace ov::genai::module {

Expand All @@ -12,9 +13,12 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
: m_dynamic_load_model_weights(dynamic_load_model_weights),
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
m_properties(properties) {
if (m_dynamic_load_model_weights) {
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
}
// parse all splitted model paths, model_path is the directory that contains all splitted models
get_splitted_model_paths(model_path, device);
load_model(model_path, properties, device);
load_model(model_path, m_properties, device);
}

void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path, const std::string& device) {
Expand Down Expand Up @@ -96,12 +100,22 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
m_postprocess_infer_request = m_postprocess_compiled_model.create_infer_request();
}

auto properties_splitted_model = properties;
for (const auto& path : m_splitted_model_paths) {
auto model = utils::singleton_core().read_model(path);
if (m_is_gpu) {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties));
if (m_dynamic_load_model_weights) {
properties_splitted_model[ov::weights_path.name()] = std::filesystem::path(path).replace_extension(".bin").string();
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
// weights are needed.
cm.release_model_weights();
m_compiled_models.push_back(std::move(cm));
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
}
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties));
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties_splitted_model));
}
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
Expand All @@ -128,6 +142,15 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {

m_full_infer_request.infer();
#else
int num_splitted_models = static_cast<int>(m_compiled_models.size());
OPENVINO_ASSERT(num_splitted_models > 1,
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));

std::future<bool> future_flag;
if (m_dynamic_load_model_weights) {
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0], m_infer_requests[0]));
}

// Preprocess
for (const auto& input : inputs) {
m_preprocess_infer_request.set_tensor(input.first, input.second.as<ov::Tensor>());
Expand Down Expand Up @@ -155,14 +178,32 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
ov::Tensor ppw_tensor = m_preprocess_infer_request.get_tensor("ppw");

// Splitted models
for (size_t i = 0; i < m_infer_requests.size(); ++i) {
std::future<bool> next_future_flag;
for (int i = 0; i < num_splitted_models; ++i) {
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
Comment on lines +207 to +209
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next_future_flag is declared unconditionally but is only used when ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT is enabled. With the default value (0), this becomes an unused variable and can trigger compiler warnings; consider declaring it under the same #if guard or marking it [[maybe_unused]].

Copilot uses AI. Check for mistakes.
if (m_dynamic_load_model_weights) {
if (i + 1 < num_splitted_models) {
next_future_flag =
thread_utils::load_model_weights_async(m_compiled_models[i + 1], m_infer_requests[i + 1]);
}
if (future_flag.valid())
future_flag.wait();
}

m_infer_requests[i].set_output_tensor(0, hidden_states_tensor);
m_infer_requests[i].set_tensor("hidden_states", hidden_states_tensor);
m_infer_requests[i].set_tensor("text_embeds", text_embeds_tensor);
m_infer_requests[i].set_tensor("timestep_proj", timestep_proj_tensor);
m_infer_requests[i].set_tensor("rotary_cos", rotary_cos_tensor);
m_infer_requests[i].set_tensor("rotary_sin", rotary_sin_tensor);
m_infer_requests[i].infer();
{
PROFILE(pmi, "infer");
m_infer_requests[i].infer();
}
if (m_dynamic_load_model_weights) {
thread_utils::release_model_weights_async(m_compiled_models[i], m_infer_requests[i]);
}
future_flag = std::move(next_future_flag);
}

GENAI_DEBUG("hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()));
Expand Down
61 changes: 47 additions & 14 deletions src/cpp/src/module_genai/utils/thread_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,70 @@

#pragma once

#include <future>
#include <chrono>
#include <future>
#include <thread>

#include "openvino/runtime/compiled_model.hpp"
#include "profiler.hpp"

namespace ov::genai::module::thread_utils {

inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
#ifndef ENABLE_DYNAMIC_MODEL_WEIGHTS
# define ENABLE_DYNAMIC_MODEL_WEIGHTS 1
#endif

#ifndef DISABLE_THREAD
# define DISABLE_THREAD 1 // Currently, multiple threads may cause a GPU crash.
#endif

#if ENABLE_DYNAMIC_MODEL_WEIGHTS
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
#if DISABLE_THREAD
PROFILE(pm, "load_model_weights sync");
compiled_model.load_model_weights();
infer_request = compiled_model.create_infer_request();
return std::async(std::launch::deferred, []() -> bool { return true; });
#else
auto load_fun = [&]() -> bool {
// compiled_model.load_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(500));
PROFILE(pm, "load_model_weights async");
compiled_model.load_model_weights();
infer_request = compiled_model.create_infer_request();
return true;
};
return std::async(std::launch::async, load_fun);
#endif
}

inline void load_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
}

inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model) {
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
#if DISABLE_THREAD
PROFILE(pm, "release_model_weights sync");
compiled_model.release_model_weights();
// release infer request to release the reference to the model weights
infer_request = ov::InferRequest();
return std::async(std::launch::deferred, []() -> bool { return true; });
#else
auto load_fun = [&]() -> bool {
// compiled_model.release_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
PROFILE(pm, "release_model_weights async");
compiled_model.release_model_weights();
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
return true;
};
return std::async(std::launch::async, load_fun);
#endif
}

inline void release_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
#else
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
return std::async(std::launch::deferred, []() -> bool {
return true;
});
}
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model,
ov::InferRequest& infer_request) {
return std::async(std::launch::deferred, []() -> bool {
return true;
});
}
#endif

} // namespace ov::genai::module::thread_utils
Loading