Xp/enable release weights #100
The first file is the splitted-model inference source (file paths are not preserved in this capture; the `#include "splitted_model_infer.hpp"` line identifies it):

```diff
@@ -1,7 +1,10 @@
 #include "splitted_model_infer.hpp"
 
+#include <regex>
+
 #include "logger.hpp"
 #include "module_genai/utils/tensor_utils.hpp"
+#include "module_genai/utils/thread_helper.hpp"
 
 namespace ov::genai::module {
```
```diff
@@ -12,9 +15,18 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
     : m_dynamic_load_model_weights(dynamic_load_model_weights),
       m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
       m_properties(properties) {
+#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+    OPENVINO_ASSERT(!m_dynamic_load_model_weights,
+                    "Dynamic loading of model weights is not enabled in this build. Please set "
+                    "ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild.");
+#endif
+
+    if (m_dynamic_load_model_weights) {
+        OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
+    }
     // parse all splitted model paths, model_path is the directory that contains all splitted models
     get_splitted_model_paths(model_path, device);
-    load_model(model_path, properties, device);
+    load_model(model_path, m_properties, device);
 }
 
 void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path, const std::string& device) {
```
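The constructor now enforces the feature gate twice: at compile time (a build without `ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS` rejects the runtime flag with an actionable message) and at runtime (the flag is GPU-only). One nit worth noting: the constructor tests the macro's value (`#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS`) while `infer()` and the unit test use `#ifdef`, so a build that defines the macro as 0 would take the `#ifdef` branches yet still trip this assert; standardizing on one convention would avoid surprises. A minimal standalone sketch of the fail-fast pattern (all names here are illustrative, not from the PR):

```cpp
// Minimal sketch: reject a runtime option when the build lacks the feature.
// Compile with -DENABLE_FEATURE_X=1 to allow the option. Names are illustrative.
#include <stdexcept>

#ifndef ENABLE_FEATURE_X
#define ENABLE_FEATURE_X 0
#endif

void configure(bool want_feature_x) {
#if !ENABLE_FEATURE_X
    // Fail fast instead of silently ignoring an option the build cannot honor.
    if (want_feature_x) {
        throw std::runtime_error(
            "Feature X is not enabled in this build; rebuild with ENABLE_FEATURE_X=1.");
    }
#endif
    // ... normal configuration continues ...
    (void)want_feature_x;
}
```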
```diff
@@ -40,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
             continue;
         }
 
-        // check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
+        // check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
+        // model
         if (filename.size() > 15 && filename.substr(filename.size() - 15) == "_preprocess.xml") {
             m_preprocess_model_path = entry.path().string();
         } else if (filename.size() > 16 && filename.substr(filename.size() - 16) == "_postprocess.xml") {
```
```diff
@@ -73,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
         "Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required.");
 }
 
-void CSplittedModelInfer::load_model(const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
+void CSplittedModelInfer::load_model(const std::string& model_path,
+                                     const ov::AnyMap& properties,
+                                     const std::string& device) {
 #if USE_FULL_MODEL
 #else
     {
```
```diff
@@ -96,14 +111,28 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
         m_postprocess_infer_request = m_postprocess_compiled_model.create_infer_request();
     }
 
+    auto properties_splitted_model = properties;
     for (const auto& path : m_splitted_model_paths) {
         auto model = utils::singleton_core().read_model(path);
         if (m_is_gpu) {
-            m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties));
+            if (m_dynamic_load_model_weights) {
+                properties_splitted_model[ov::weights_path.name()] =
+                    std::filesystem::path(path).replace_extension(".bin").string();
+                auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
+                // Release model weights after compilation to save GPU memory. Load weights again in infer() when
+                // weights are needed.
+                cm.release_model_weights();
+                m_compiled_models.push_back(std::move(cm));
+            } else {
+                m_compiled_models.push_back(
+                    utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
+                m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
+            }
         } else {
-            m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties));
+            m_compiled_models.push_back(
+                utils::singleton_core().compile_model(model, device, properties_splitted_model));
+            m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
         }
-        m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
     }
 #endif
 }
```
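In the dynamic path each split model is compiled with `ov::weights_path` pointing at its own `.bin`, so the plugin can re-read weights on demand; the weights are dropped right after compilation, and no persistent infer request is created, since a live request would pin the weights (one is created per iteration in `infer()` instead). A condensed, compilable sketch of that per-model flow; `release_model_weights()` is the fork-specific `ov::CompiledModel` API this PR relies on, not stock OpenVINO:

```cpp
// Condensed sketch of the dynamic-weights compile path above.
// release_model_weights() is assumed to be the fork's API; paths are placeholders.
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
#include <openvino/openvino.hpp>

void compile_with_released_weights(ov::Core& core,
                                   const std::shared_ptr<ov::Model>& model,
                                   const std::string& xml_path,
                                   const ov::RemoteContext& gpu_context,
                                   ov::AnyMap props,  // taken by value so the caller's map stays untouched
                                   std::vector<ov::CompiledModel>& out) {
    // Point the plugin at the weights file so they can be re-read on demand later.
    props[ov::weights_path.name()] =
        std::filesystem::path(xml_path).replace_extension(".bin").string();
    auto cm = core.compile_model(model, gpu_context, props);
    cm.release_model_weights();  // fork API: evict weights from GPU memory now
    out.push_back(std::move(cm));
    // Deliberately no create_infer_request() here: a live request would keep
    // the weights referenced, so requests are created per-iteration in infer().
}
```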
```diff
@@ -128,6 +157,24 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
 
     m_full_infer_request.infer();
 #else
+    int num_splitted_models = static_cast<int>(m_compiled_models.size());
+    OPENVINO_ASSERT(num_splitted_models > 1,
+                    "Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));
+
+# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+    std::future<bool> future_flag;
+    if (m_dynamic_load_model_weights) {
+        future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0]));
+    }
+# else   // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+    if (m_dynamic_load_model_weights) {
+        PROFILE(pm, "load_model_weights");
+        m_compiled_models[0].load_model_weights();
+    }
+# endif  // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+# endif  // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+
     // Preprocess
     for (const auto& input : inputs) {
         m_preprocess_infer_request.set_tensor(input.first, input.second.as<ov::Tensor>());
```
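`thread_utils::load_model_weights_async` comes from the newly included `thread_helper.hpp`, which this diff does not show. Assuming it is a thin `std::async` wrapper around the fork's `load_model_weights()` (an assumption, not confirmed by this PR), it would look roughly like this:

```cpp
// Hypothetical sketch only: the real helper lives in
// module_genai/utils/thread_helper.hpp and is not part of this diff.
#include <future>
#include <openvino/runtime/compiled_model.hpp>

namespace thread_utils {
inline std::future<bool> load_model_weights_async(ov::CompiledModel& cm) {
    return std::async(std::launch::async, [&cm] {
        cm.load_model_weights();  // fork-specific API exercised by this PR
        return true;
    });
}
}  // namespace thread_utils
```

Incidentally, the `std::move` wrapped around the helper's return value in the prologue above is redundant: the returned `std::future` is already a prvalue, so the move assignment happens without it.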
```diff
@@ -155,17 +202,60 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
     ov::Tensor ppw_tensor = m_preprocess_infer_request.get_tensor("ppw");
 
     // Splitted models
-    for (size_t i = 0; i < m_infer_requests.size(); ++i) {
-        m_infer_requests[i].set_output_tensor(0, hidden_states_tensor);
-        m_infer_requests[i].set_tensor("hidden_states", hidden_states_tensor);
-        m_infer_requests[i].set_tensor("text_embeds", text_embeds_tensor);
-        m_infer_requests[i].set_tensor("timestep_proj", timestep_proj_tensor);
-        m_infer_requests[i].set_tensor("rotary_cos", rotary_cos_tensor);
-        m_infer_requests[i].set_tensor("rotary_sin", rotary_sin_tensor);
-        m_infer_requests[i].infer();
+    std::future<bool> next_future_flag;
+    for (int i = 0; i < num_splitted_models; ++i) {
+        PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
+
+        ov::InferRequest curInferRequest;
+        if (m_dynamic_load_model_weights) {
+# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+            if (i + 1 < num_splitted_models) {
+# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+                next_future_flag = thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
+# else
+                m_compiled_models[i + 1].load_model_weights();
+# endif  // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+            }
+
+# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+            if (future_flag.valid())
+                future_flag.wait();
+# endif  // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+
+            curInferRequest = m_compiled_models[i].create_infer_request();
+# endif  // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+        } else {
+            curInferRequest = m_infer_requests[i];
+        }
+
+        curInferRequest.set_output_tensor(0, hidden_states_tensor);
+        curInferRequest.set_tensor("hidden_states", hidden_states_tensor);
+        curInferRequest.set_tensor("text_embeds", text_embeds_tensor);
+        curInferRequest.set_tensor("timestep_proj", timestep_proj_tensor);
+        curInferRequest.set_tensor("rotary_cos", rotary_cos_tensor);
+        curInferRequest.set_tensor("rotary_sin", rotary_sin_tensor);
+        {
+            PROFILE(pmi, "infer");
+            curInferRequest.infer();
+        }
+
+# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+        if (m_dynamic_load_model_weights) {
+# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+            thread_utils::release_model_weights_async(m_compiled_models[i], std::move(curInferRequest));
+# else
+            curInferRequest = ov::InferRequest();  // release infer request before releasing model weights to ensure
+                                                   // the model weights can be released successfully.
+            m_compiled_models[i].release_model_weights();
+# endif
+        }
+
+# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
+        future_flag = std::move(next_future_flag);
+# endif
+# endif  // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
     }
 
-    GENAI_DEBUG("hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()));
+    GENAI_DEBUG(
+        "hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()) +
+        ", shape:" + tensor_utils::shape_to_string(hidden_states_tensor.get_shape()));
 
     // Postprocess
     m_postprocess_infer_request.set_tensor("hidden_states", hidden_states_tensor);
```
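Taken together, the loop is a classic two-slot pipeline: while split model `i` runs, the weights of model `i + 1` stream in on a worker thread, and model `i`'s weights (plus its one-shot infer request) are released as soon as it finishes, so roughly two models' weights are resident at any moment. A standalone simulation of that schedule, with plain structs in place of OpenVINO types and made-up timings, just to make the overlap easy to trace:

```cpp
// Standalone sketch of the prefetch/release schedule used in the loop above.
// Pure std::async; Stage stands in for a compiled model. Timings are invented.
#include <chrono>
#include <cstddef>
#include <future>
#include <iostream>
#include <thread>
#include <vector>

struct Stage {
    int id;
    void load_weights() {      // stands in for CompiledModel::load_model_weights()
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }
    void release_weights() {}  // stands in for release_model_weights()
    void infer() {             // stands in for InferRequest::infer()
        std::this_thread::sleep_for(std::chrono::milliseconds(80));
    }
};

int main() {
    std::vector<Stage> stages{{0}, {1}, {2}, {3}};
    // Prime the pipeline: stage 0's weights start loading before the loop.
    std::future<bool> ready = std::async(std::launch::async, [&stages] {
        stages[0].load_weights();
        return true;
    });
    for (std::size_t i = 0; i < stages.size(); ++i) {
        std::future<bool> next_ready;
        if (i + 1 < stages.size()) {
            // Overlap: kick off the next stage's weight load on a worker thread.
            next_ready = std::async(std::launch::async, [&stages, i] {
                stages[i + 1].load_weights();
                return true;
            });
        }
        if (ready.valid()) ready.wait();  // stage i's weights must be resident
        stages[i].infer();
        stages[i].release_weights();      // free memory before moving on
        std::cout << "stage " << i << " done\n";
        ready = std::move(next_ready);
    }
}
```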
An ignore file (apparently the tests' `.gitignore`) gains an entry for the new unit-test cache directory; the repeated `profile*.json` line is the usual rendering of a newline-at-end-of-file fix:

```diff
@@ -1,2 +1,3 @@
 dumped*.yaml
-profile*.json
+profile*.json
+unittest_cache_dir_*
```
The third file is the denoiser-loop module test:

```diff
@@ -14,6 +14,7 @@ struct DenoiserLoopTestData {
     ov::genai::DiffusionModelType model_type;
     std::string model_path;
     bool splitted_model = false;
+    bool dynamic_load_model_weights = false;
     ov::Tensor latents;
     ov::Tensor prompt_embed;
     ov::Tensor negative_prompt_embed;
```
```diff
@@ -52,9 +53,18 @@ std::vector<DenoiserLoopTestData> denoiser_loop_test_data() {
     };
     datas.push_back(wan_data);
 
     // Split model for Wan
     DenoiserLoopTestData wan_data_splitted_model = wan_data;
     wan_data_splitted_model.splitted_model = true;
     datas.push_back(wan_data_splitted_model);
 
+#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
+    // Dynamic load weights for Split model
+    DenoiserLoopTestData wan_data_dyn_weights = wan_data;
+    wan_data_dyn_weights.splitted_model = true;
+    wan_data_dyn_weights.dynamic_load_model_weights = true;
+    datas.push_back(wan_data_dyn_weights);
+#endif
     return datas;
 }
```
```diff
@@ -77,12 +87,17 @@ class DenoiserLoopModuleTest : public ModuleTestBase, public ::testing::TestWith
         result = std::regex_replace(diffusion_model_type_to_string(test_data.model_type), std::regex("\\."), "_");
         result = result + "_" + device;
         result = result + "_SplittedModel_" + (test_data.splitted_model ? "true" : "false");
+        result = result + "_DynamicLoadWeights_" + (test_data.dynamic_load_model_weights ? "true" : "false");
         return result;
     }
 
     void SetUp() override {
         REGISTER_TEST_NAME();
         std::tie(m_test_data, m_device) = GetParam();
+
+        if (m_device != "GPU" && m_test_data.dynamic_load_model_weights) {
+            GTEST_SKIP() << "Skipping test for non-GPU device.";
+        }
     }
 
     void TearDown() override {}
```
```diff
@@ -122,6 +137,10 @@ class DenoiserLoopModuleTest : public ModuleTestBase, public ::testing::TestWith
         YAML::Node params;
         params["model_path"] = m_test_data.model_path;
         params["splitted_model"] = m_test_data.splitted_model ? "true" : "false";
+        params["dynamic_load_weights"] = m_test_data.dynamic_load_model_weights ? "true" : "false";
+        if (m_test_data.dynamic_load_model_weights) {
+            params["cache_dir"] = "./unittest_cache_dir_denoiserloop";
+        }
         denoiser_loop["params"] = params;
         pipeline_modules["denoiser_loop"] = denoiser_loop;
         return YAML::Dump(config);
```
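`cache_dir` is emitted only for the dynamic-weights variant, and the ignore-file entry above keeps that directory out of version control. A standalone yaml-cpp sketch of the params fragment the test builds (the model path is a placeholder, and the surrounding `denoiser_loop` config layout is not shown in this diff):

```cpp
// Standalone yaml-cpp sketch of the params fragment built in the test above.
// Key names match the test code; model_path is a placeholder.
#include <iostream>
#include <yaml-cpp/yaml.h>

int main() {
    YAML::Node params;
    params["model_path"] = "/path/to/wan/split/models";  // placeholder
    params["splitted_model"] = "true";
    params["dynamic_load_weights"] = "true";
    params["cache_dir"] = "./unittest_cache_dir_denoiserloop";
    std::cout << YAML::Dump(params) << "\n";
    return 0;
}
```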