Skip to content

Commit 2b8db2d

Browse files
committed
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
Signed-off-by: xiping.yan <xiping.yan@intel.com>
1 parent 8d6154d commit 2b8db2d

File tree

7 files changed

+68
-45
lines changed

7 files changed

+68
-45
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,11 @@ __pycache__
4545
_codeql_detected_source_root
4646
install
4747
*.avi
48+
49+
# Module-GenAI generated files
50+
dumped_*.yaml
51+
generated_*.bmp
52+
samples/python/module_genai/*.sh
53+
tests/module_genai/cpp/*.sh
54+
tests/module_genai/cpp/*.yaml
55+
tests/module_genai/cpp/test_data/*.json

cmake/features.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ option(ENABLE_TESTS "Enable tests build" ON)
1010
option(ENABLE_TOOLS "Enable tools build" ON)
1111
option(ENABLE_GGUF "Enable support for GGUF format" ON)
1212
option(ENABLE_XGRAMMAR "Enable support for structured output generation with xgrammar backend" ON)
13+
option(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS "Enable offloading model weights (load/release)" OFF)
1314

1415
# Disable building samples for NPM package
1516
if(CPACK_GENERATOR STREQUAL "NPM")

src/cpp/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ if(ENABLE_GGUF)
146146
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_GGUF)
147147
endif()
148148

149+
if(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS)
150+
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS=1)
151+
endif()
152+
149153
target_include_directories(${TARGET_NAME_OBJ} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")
150154

151155
target_link_libraries(${TARGET_NAME_OBJ} PRIVATE openvino::runtime openvino::threading nlohmann_json::nlohmann_json minja ${YAML_CPP_TARGET} PRIVATE TBB::tbb)

src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_model_infer.cpp

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
1515
: m_dynamic_load_model_weights(dynamic_load_model_weights),
1616
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
1717
m_properties(properties) {
18+
#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
19+
OPENVINO_ASSERT(!m_dynamic_load_model_weights,
20+
"Dynamic loading of model weights is not enabled in this build. Please set "
21+
"ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild.");
22+
#endif
23+
1824
if (m_dynamic_load_model_weights) {
1925
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
2026
}
@@ -46,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
4652
continue;
4753
}
4854

49-
// check if the file name ends with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
55+
// check if the file name ends with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
56+
// model
5057
if (filename.size() > 15 && filename.substr(filename.size() - 15) == "_preprocess.xml") {
5158
m_preprocess_model_path = entry.path().string();
5259
} else if (filename.size() > 16 && filename.substr(filename.size() - 16) == "_postprocess.xml") {
@@ -79,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
7986
"Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required.");
8087
}
8188

82-
void CSplittedModelInfer::load_model(const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
89+
void CSplittedModelInfer::load_model(const std::string& model_path,
90+
const ov::AnyMap& properties,
91+
const std::string& device) {
8392
#if USE_FULL_MODEL
8493
#else
8594
{
@@ -107,18 +116,21 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
107116
auto model = utils::singleton_core().read_model(path);
108117
if (m_is_gpu) {
109118
if (m_dynamic_load_model_weights) {
110-
properties_splitted_model[ov::weights_path.name()] = std::filesystem::path(path).replace_extension(".bin").string();
119+
properties_splitted_model[ov::weights_path.name()] =
120+
std::filesystem::path(path).replace_extension(".bin").string();
111121
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
112122
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
113123
// weights are needed.
114124
cm.release_model_weights();
115125
m_compiled_models.push_back(std::move(cm));
116126
} else {
117-
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
127+
m_compiled_models.push_back(
128+
utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
118129
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
119130
}
120131
} else {
121-
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties_splitted_model));
132+
m_compiled_models.push_back(
133+
utils::singleton_core().compile_model(model, device, properties_splitted_model));
122134
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
123135
}
124136
}
@@ -149,10 +161,19 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
149161
OPENVINO_ASSERT(num_splitted_models > 1,
150162
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));
151163

164+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
165+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
152166
std::future<bool> future_flag;
153167
if (m_dynamic_load_model_weights) {
154168
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0]));
155169
}
170+
# else // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
171+
if (m_dynamic_load_model_weights) {
172+
PROFILE(pm, "load_model_weights");
173+
m_compiled_models[0].load_model_weights();
174+
}
175+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
176+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
156177

157178
// Preprocess
158179
for (const auto& input : inputs) {
@@ -186,14 +207,20 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
186207
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
187208
ov::InferRequest curInferRequest;
188209
if (m_dynamic_load_model_weights) {
210+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
189211
if (i + 1 < num_splitted_models) {
190-
next_future_flag =
191-
thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
212+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
213+
next_future_flag = thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
214+
# else
215+
m_compiled_models[i + 1].load_model_weights();
216+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
192217
}
218+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
193219
if (future_flag.valid())
194220
future_flag.wait();
195-
221+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
196222
curInferRequest = m_compiled_models[i].create_infer_request();
223+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
197224
} else {
198225
curInferRequest = m_infer_requests[i];
199226
}
@@ -208,10 +235,22 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
208235
PROFILE(pmi, "infer");
209236
curInferRequest.infer();
210237
}
238+
239+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
211240
if (m_dynamic_load_model_weights) {
241+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
212242
thread_utils::release_model_weights_async(m_compiled_models[i], std::move(curInferRequest));
243+
# else
244+
curInferRequest = ov::InferRequest(); // release infer request before releasing model weights to ensure the
245+
// model weights can be released successfully.
246+
m_compiled_models[i].release_model_weights();
247+
# endif
213248
}
249+
250+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
214251
future_flag = std::move(next_future_flag);
252+
# endif
253+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
215254
}
216255

217256
GENAI_DEBUG(

src/cpp/src/module_genai/utils/thread_helper.hpp

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,60 +12,29 @@
1212

1313
namespace ov::genai::module::thread_utils {
1414

15-
#ifndef ENABLE_DYNAMIC_MODEL_WEIGHTS
16-
# define ENABLE_DYNAMIC_MODEL_WEIGHTS 1
15+
#ifndef ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
16+
# define ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT 0 // Currently, multi-threaded weight loading may cause a GPU crash.
1717
#endif
1818

19-
#ifndef DISABLE_THREAD
20-
# define DISABLE_THREAD 1 // Current multiple threads may cause GPU crash.
21-
#endif
22-
23-
#if ENABLE_DYNAMIC_MODEL_WEIGHTS
19+
#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
2420
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
25-
#if DISABLE_THREAD
26-
PROFILE(pm, "load_model_weights sync");
27-
compiled_model.load_model_weights();
28-
// infer_request = compiled_model.create_infer_request();
29-
return std::async(std::launch::deferred, []() -> bool { return true; });
30-
#else
3121
auto load_fun = [&]() -> bool {
3222
PROFILE(pm, "load_model_weights async");
3323
compiled_model.load_model_weights();
3424
// infer_request = compiled_model.create_infer_request();
3525
return true;
3626
};
3727
return std::async(std::launch::async, load_fun);
38-
#endif
3928
}
4029

4130
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest infer_request) {
42-
#if DISABLE_THREAD
43-
PROFILE(pm, "release_model_weights sync");
44-
compiled_model.release_model_weights();
45-
// release infer request to release the reference to the model weights
46-
infer_request = ov::InferRequest();
47-
return std::async(std::launch::deferred, []() -> bool { return true; });
48-
#else
4931
auto load_fun = [&]() -> bool {
5032
PROFILE(pm, "release_model_weights async");
51-
compiled_model.release_model_weights();
5233
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
34+
compiled_model.release_model_weights();
5335
return true;
5436
};
5537
return std::async(std::launch::async, load_fun);
56-
#endif
57-
}
58-
#else
59-
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
60-
return std::async(std::launch::deferred, []() -> bool {
61-
return true;
62-
});
63-
}
64-
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model,
65-
ov::InferRequest& infer_request) {
66-
return std::async(std::launch::deferred, []() -> bool {
67-
return true;
68-
});
6938
}
7039
#endif
7140

tests/module_genai/cpp/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
dumped*.yaml
2-
profile*.json
2+
profile*.json
3+
unittest_cache_dir_*

tests/module_genai/cpp/modules/DenoiserLoopModule.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,13 @@ std::vector<DenoiserLoopTestData> denoiser_loop_test_data() {
5858
wan_data_splitted_model.splitted_model = true;
5959
datas.push_back(wan_data_splitted_model);
6060

61+
#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
6162
// Dynamic load weights for Split model
6263
DenoiserLoopTestData wan_data_dyn_weights = wan_data;
6364
wan_data_dyn_weights.splitted_model = true;
6465
wan_data_dyn_weights.dynamic_load_model_weights = true;
6566
datas.push_back(wan_data_dyn_weights);
66-
67+
#endif
6768
return datas;
6869
}
6970

0 commit comments

Comments
 (0)