Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c0048a
remote context.
xipingyan Feb 6, 2026
3662cf6
Enable remote tensor.
xipingyan Feb 6, 2026
a10c31a
use genai_debug to print log.
xipingyan Feb 6, 2026
940531b
draft
xipingyan Feb 6, 2026
307c8e1
disable multiple thread load/release weight.
xipingyan Feb 6, 2026
234d0a5
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 6, 2026
5ab6d13
add macro, easy to debug.
xipingyan Feb 6, 2026
78d56a9
Default disable dynamic load weights, performance is low.
xipingyan Feb 6, 2026
28f624c
Apply suggestion from @Copilot
xipingyan Feb 6, 2026
7266143
enable dynamic load model weights unit test.
xipingyan Feb 7, 2026
d45d2d2
skipped, only for dynamic load weights.
xipingyan Feb 7, 2026
bc53caf
dynamic load model weights, not create inferrequest.
xipingyan Feb 7, 2026
de987d9
Apply suggestion from @Copilot
xipingyan Feb 12, 2026
2e3450f
Apply suggestions from code review
xipingyan Feb 12, 2026
8d6154d
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 25, 2026
2b8db2d
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
xipingyan Feb 26, 2026
1adbc6c
fix build issue.
xipingyan Feb 26, 2026
daf16dc
Update src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_mod…
xipingyan Feb 26, 2026
3aad6b6
rename to release_fun
xipingyan Feb 26, 2026
4b3a6be
Fix dangling reference in async lambda captures in thread_helper.hpp …
Copilot Feb 26, 2026
1296c2c
-#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
xipingyan Feb 26, 2026
0d416d5
Fix GPU device skip condition in DenoiserLoopModule test to use subst…
Copilot Feb 26, 2026
3a25241
Update src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
xipingyan Feb 26, 2026
422d914
remove duplicated: check_cache_dir
xipingyan Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,11 @@ __pycache__
_codeql_detected_source_root
install
*.avi

# Module-GenAI generated files
dumped_*.yaml
generated_*.bmp
samples/python/module_genai/*.sh
tests/module_genai/cpp/*.sh
tests/module_genai/cpp/*.yaml
tests/module_genai/cpp/test_data/*.json
1 change: 1 addition & 0 deletions cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ option(ENABLE_TESTS "Enable tests build" ON)
option(ENABLE_TOOLS "Enable tools build" ON)
option(ENABLE_GGUF "Enable support for GGUF format" ON)
option(ENABLE_XGRAMMAR "Enable support for structured output generation with xgrammar backend" ON)
option(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS "Enable offloading model weights (load/release)" OFF)

# Disable building samples for NPM package
if(CPACK_GENERATOR STREQUAL "NPM")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pipeline_modules:
cache_dir: ./cache_dir_denoiser_loop/
model_path: tests/module_genai/cpp/test_models/Wan2.1-T2V-1.3B-Diffusers
splitted_model: true
dynamic_load_weights: false # performance is low.
type: DenoiserLoopModule
latent_image:
device: CPU
Expand Down
4 changes: 4 additions & 0 deletions src/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ if(ENABLE_GGUF)
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_GGUF)
endif()

if(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS)
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS=1)
endif()

target_include_directories(${TARGET_NAME_OBJ} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")

target_link_libraries(${TARGET_NAME_OBJ} PRIVATE openvino::runtime openvino::threading nlohmann_json::nlohmann_json minja ${YAML_CPP_TARGET} PRIVATE TBB::tbb)
Expand Down
18 changes: 18 additions & 0 deletions src/cpp/src/module_genai/module_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,24 @@ void IBaseModule::check_splitted_model() {
}
}

// Read an optional module parameter and interpret it as a boolean.
// Recognized spellings: "true"/"True"/"TRUE"/"1" and "false"/"False"/"FALSE"/"0".
// A missing parameter silently yields default_value; an unrecognized value
// logs an error and also falls back to default_value.
bool IBaseModule::check_bool_param(const std::string& param_name, const bool& default_value) {
    const auto raw = get_optional_param(param_name);
    if (raw.empty()) {
        // Parameter not present in the config: use the caller-supplied default.
        return default_value;
    }

    const bool matches_true = (raw == "true" || raw == "True" || raw == "TRUE" || raw == "1");
    const bool matches_false = (raw == "false" || raw == "False" || raw == "FALSE" || raw == "0");
    if (matches_true || matches_false) {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = " +
                   (matches_true ? "true" : "false"));
        return matches_true;
    }

    // Value present but not a recognized boolean spelling: report and fall back.
    GENAI_ERR("Module[" + module_desc->name + "]: Invalid bool param value for '" + param_name + "': " + raw +
              ", use default value: " + (default_value ? "true" : "false"));
    return default_value;
}

// PipelineDesc implementation
PipelineDesc::PipelineDesc() : m_resource_cache(std::make_unique<PipelineResourceCache>()) {}

Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/module_genai/module_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class IBaseModule {
bool m_splitted_model = false;
void check_splitted_model();

bool check_bool_param(const std::string& param_name, const bool& default_value);

// Initialize ov::Model from config models_map with param_name: "ov_model"
void init_ov_model();
};
Expand Down
13 changes: 10 additions & 3 deletions src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ void DenoiserLoopModule::print_static_config() {
type: "OVTensor" # Support DataType: [OVTensor]
params:
model_path: "model"
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string.
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string. But `splitted_model` and `dynamic_load_weights` depend on it.
dynamic_load_weights: "bool value" # [Optional], default false. Whether to dynamically load/release model weights during inference to save GPU memory.
)" << std::endl;
}

Expand Down Expand Up @@ -82,6 +83,13 @@ bool DenoiserLoopModule::initialize() {

check_splitted_model();

m_dynamic_load_weights = check_bool_param("dynamic_load_weights", false);

check_cache_dir();
if (m_dynamic_load_weights && m_cache_dir.empty()) {
GENAI_ERR("TransformerModule[" + module_desc->name + "]: 'cache_dir' must be set when 'dynamic_load_weights' is enabled");
return false;
Comment on lines +89 to +91
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These error logs reference TransformerModule[...], but this is DenoiserLoopModule. Using the wrong module name makes debugging configuration issues harder; update the log prefix to match the actual module.

Copilot uses AI. Check for mistakes.
}
std::filesystem::path model_path = module_desc->get_full_path(it_path->second);
auto transformer_model_path = model_path / "transformer/openvino_model.xml";
if (m_model_type == DiffusionModelType::ZIMAGE) {
Expand All @@ -104,7 +112,6 @@ bool DenoiserLoopModule::initialize() {
}

auto properties = ov::AnyMap{};
check_cache_dir();
if (!m_cache_dir.empty()) {
properties["CACHE_DIR"] = m_cache_dir;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#include "splitted_model_infer.hpp"

#include <regex>

#include "logger.hpp"
#include "module_genai/utils/tensor_utils.hpp"
#include "module_genai/utils/thread_helper.hpp"

namespace ov::genai::module {

Expand All @@ -12,9 +15,18 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
: m_dynamic_load_model_weights(dynamic_load_model_weights),
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
m_properties(properties) {
#ifndef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
OPENVINO_ASSERT(!m_dynamic_load_model_weights,
"Dynamic loading of model weights is not enabled in this build. Please set "
"ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild.");
#endif

if (m_dynamic_load_model_weights) {
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
}
// parse all splitted model paths, model_path is the directory that contains all splitted models
get_splitted_model_paths(model_path, device);
load_model(model_path, properties, device);
load_model(model_path, m_properties, device);
}

void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path, const std::string& device) {
Expand All @@ -40,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
continue;
}

// check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
// check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
// model
if (filename.size() > 15 && filename.substr(filename.size() - 15) == "_preprocess.xml") {
m_preprocess_model_path = entry.path().string();
} else if (filename.size() > 16 && filename.substr(filename.size() - 16) == "_postprocess.xml") {
Expand Down Expand Up @@ -73,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
"Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required.");
}

void CSplittedModelInfer::load_model(const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
void CSplittedModelInfer::load_model(const std::string& model_path,
const ov::AnyMap& properties,
const std::string& device) {
#if USE_FULL_MODEL
#else
{
Expand All @@ -96,14 +111,30 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
m_postprocess_infer_request = m_postprocess_compiled_model.create_infer_request();
}

auto properties_splitted_model = properties;
for (const auto& path : m_splitted_model_paths) {
auto model = utils::singleton_core().read_model(path);
if (m_is_gpu) {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties));
if (m_dynamic_load_model_weights) {
properties_splitted_model[ov::weights_path.name()] =
std::filesystem::path(path).replace_extension(".bin").string();
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
// weights are needed.
cm.release_model_weights();
# endif
m_compiled_models.push_back(std::move(cm));
} else {
m_compiled_models.push_back(
utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties));
m_compiled_models.push_back(
utils::singleton_core().compile_model(model, device, properties_splitted_model));
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
#endif
}
Expand All @@ -128,6 +159,24 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {

m_full_infer_request.infer();
#else
int num_splitted_models = static_cast<int>(m_compiled_models.size());
OPENVINO_ASSERT(num_splitted_models > 1,
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));

# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
std::future<bool> future_flag;
if (m_dynamic_load_model_weights) {
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0]));
}
# else // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
if (m_dynamic_load_model_weights) {
PROFILE(pm, "load_model_weights");
m_compiled_models[0].load_model_weights();
}
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS

// Preprocess
for (const auto& input : inputs) {
m_preprocess_infer_request.set_tensor(input.first, input.second.as<ov::Tensor>());
Expand Down Expand Up @@ -155,17 +204,64 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
ov::Tensor ppw_tensor = m_preprocess_infer_request.get_tensor("ppw");

// Splitted models
for (size_t i = 0; i < m_infer_requests.size(); ++i) {
m_infer_requests[i].set_output_tensor(0, hidden_states_tensor);
m_infer_requests[i].set_tensor("hidden_states", hidden_states_tensor);
m_infer_requests[i].set_tensor("text_embeds", text_embeds_tensor);
m_infer_requests[i].set_tensor("timestep_proj", timestep_proj_tensor);
m_infer_requests[i].set_tensor("rotary_cos", rotary_cos_tensor);
m_infer_requests[i].set_tensor("rotary_sin", rotary_sin_tensor);
m_infer_requests[i].infer();
std::future<bool> next_future_flag;
for (int i = 0; i < num_splitted_models; ++i) {
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
Comment on lines +207 to +209
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next_future_flag is declared unconditionally but is only used when ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT is enabled. With the default value (0), this becomes an unused variable and can trigger compiler warnings; consider declaring it under the same #if guard or marking it [[maybe_unused]].

Copilot uses AI. Check for mistakes.
ov::InferRequest curInferRequest;
if (m_dynamic_load_model_weights) {
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
if (i + 1 < num_splitted_models) {
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
next_future_flag = thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
# else
m_compiled_models[i + 1].load_model_weights();
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
}
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
if (future_flag.valid())
future_flag.wait();
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
Comment on lines +220 to +223
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When multithreaded loading is enabled, the code uses future_flag.wait() but never calls get(). This can silently swallow exceptions from load_model_weights_async and make failures harder to diagnose; use get() (or propagate/log exceptions) after waiting.

Copilot uses AI. Check for mistakes.
curInferRequest = m_compiled_models[i].create_infer_request();
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
} else {
curInferRequest = m_infer_requests[i];
}

curInferRequest.set_output_tensor(0, hidden_states_tensor);
curInferRequest.set_tensor("hidden_states", hidden_states_tensor);
curInferRequest.set_tensor("text_embeds", text_embeds_tensor);
curInferRequest.set_tensor("timestep_proj", timestep_proj_tensor);
curInferRequest.set_tensor("rotary_cos", rotary_cos_tensor);
curInferRequest.set_tensor("rotary_sin", rotary_sin_tensor);
{
PROFILE(pmi, "infer");
curInferRequest.infer();
}

# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
if (m_dynamic_load_model_weights) {
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
auto release_future =
thread_utils::release_model_weights_async(m_compiled_models[i], std::move(curInferRequest));
if (release_future.valid()) {
release_future.wait();
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, release_future.wait() does not call get(), so exceptions from the async release path are ignored. Consider consuming the future with get() (and handling/logging failures) to avoid hiding runtime issues.

Suggested change
release_future.wait();
try {
release_future.get();
} catch (const std::exception& e) {
GENAI_ERROR(std::string("Exception during async model weights release: ") + e.what());
} catch (...) {
GENAI_ERROR("Unknown exception during async model weights release");
}

Copilot uses AI. Check for mistakes.
}
# else
curInferRequest = ov::InferRequest(); // release infer request before releasing model weights to ensure the
// model weights can be released successfully.
m_compiled_models[i].release_model_weights();
# endif
}

# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
future_flag = std::move(next_future_flag);
# endif
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
}

GENAI_DEBUG("hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()));
GENAI_DEBUG(
"hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()) +
", shape:" + tensor_utils::shape_to_string(hidden_states_tensor.get_shape()));

// Postprocess
m_postprocess_infer_request.set_tensor("hidden_states", hidden_states_tensor);
Expand Down
40 changes: 21 additions & 19 deletions src/cpp/src/module_genai/utils/thread_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,39 @@

#pragma once

#include <future>
#include <chrono>
#include <future>
#include <thread>

#include "openvino/runtime/compiled_model.hpp"
#include "profiler.hpp"

namespace ov::genai::module::thread_utils {

inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
auto load_fun = [&]() -> bool {
// compiled_model.load_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(500));
#ifndef ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
# define ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT 0 // Current multiple threads may cause GPU crash.
#endif

#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
inline std::future<bool> load_model_weights_async(ov::CompiledModel compiled_model) {
auto load_fun = [compiled_model]() mutable -> bool {
PROFILE(pm, "load_model_weights async");
compiled_model.load_model_weights();
// infer_request = compiled_model.create_infer_request();
return true;
};
return std::async(std::launch::async, load_fun);
return std::async(std::launch::async, std::move(load_fun));
}

inline void load_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
}

inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model) {
auto load_fun = [&]() -> bool {
// compiled_model.release_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
inline std::future<bool> release_model_weights_async(ov::CompiledModel compiled_model, ov::InferRequest infer_request) {
auto release_fun = [compiled_model, infer_request]() mutable -> bool {
PROFILE(pm, "release_model_weights async");
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
compiled_model.release_model_weights();
return true;
};
return std::async(std::launch::async, load_fun);
}

inline void release_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
return std::async(std::launch::async, std::move(release_fun));
}
#endif

} // namespace ov::genai::module::thread_utils
3 changes: 2 additions & 1 deletion tests/module_genai/cpp/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
dumped*.yaml
profile*.json
profile*.json
unittest_cache_dir_*
Loading
Loading