Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c0048a
remote context.
xipingyan Feb 6, 2026
3662cf6
Enable remote tensor.
xipingyan Feb 6, 2026
a10c31a
use genai_debug to print log.
xipingyan Feb 6, 2026
940531b
draft
xipingyan Feb 6, 2026
307c8e1
disable multiple thread load/release weight.
xipingyan Feb 6, 2026
234d0a5
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 6, 2026
5ab6d13
add macro, easy to debug.
xipingyan Feb 6, 2026
78d56a9
Disable dynamic load weights by default; performance is low.
xipingyan Feb 6, 2026
28f624c
Apply suggestion from @Copilot
xipingyan Feb 6, 2026
7266143
enable dynamic load model weights unit test.
xipingyan Feb 7, 2026
d45d2d2
skipped, only for dynamic load weights.
xipingyan Feb 7, 2026
bc53caf
dynamic load model weights: do not create infer request.
xipingyan Feb 7, 2026
de987d9
Apply suggestion from @Copilot
xipingyan Feb 12, 2026
2e3450f
Apply suggestions from code review
xipingyan Feb 12, 2026
8d6154d
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 25, 2026
2b8db2d
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
xipingyan Feb 26, 2026
1adbc6c
fix build issue.
xipingyan Feb 26, 2026
daf16dc
Update src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_mod…
xipingyan Feb 26, 2026
3aad6b6
rename to release_fun
xipingyan Feb 26, 2026
4b3a6be
Fix dangling reference in async lambda captures in thread_helper.hpp …
Copilot Feb 26, 2026
1296c2c
-#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
xipingyan Feb 26, 2026
0d416d5
Fix GPU device skip condition in DenoiserLoopModule test to use subst…
Copilot Feb 26, 2026
3a25241
Update src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
xipingyan Feb 26, 2026
422d914
remove duplicated: check_cache_dir
xipingyan Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pipeline_modules:
cache_dir: ./cache_dir_denoiser_loop/
model_path: tests/module_genai/cpp/test_models/Wan2.1-T2V-1.3B-Diffusers
splitted_model: true
dynamic_load_weights: true
type: DenoiserLoopModule
latent_image:
device: CPU
Expand Down
18 changes: 18 additions & 0 deletions src/cpp/src/module_genai/module_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,24 @@ void IBaseModule::check_splitted_model() {
}
}

// Reads the optional module parameter `param_name` and interprets it as a boolean.
// - Missing/empty value            -> returns `default_value`.
// - "true"/"1" (any letter case)   -> returns true  (logged at INFO level).
// - "false"/"0" (any letter case)  -> returns false (logged at INFO level).
// - Anything else                  -> logs an error and returns `default_value`.
// Note: `default_value` stays `const bool&` to match the declaration in module_base.hpp.
bool IBaseModule::check_bool_param(const std::string& param_name, const bool& default_value) {
    auto p = get_optional_param(param_name);
    if (p.empty()) {
        return default_value;
    }

    // Normalize ASCII letters to lowercase so every capitalization is accepted
    // ("true", "True", "TRUE", "tRuE", ...), not just the three spellings that
    // were previously special-cased. No <algorithm>/<cctype> needed.
    std::string lowered = p;
    for (char& c : lowered) {
        if (c >= 'A' && c <= 'Z') {
            c = static_cast<char>(c - 'A' + 'a');
        }
    }

    if (lowered == "true" || lowered == "1") {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = true");
        return true;
    }
    if (lowered == "false" || lowered == "0") {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = false");
        return false;
    }
    GENAI_ERR("Module[" + module_desc->name + "]: Invalid bool param value for '" + param_name + "': " + p +
              ", use default value: " + (default_value ? "true" : "false"));
    return default_value;
}

} // namespace module
} // namespace genai
} // namespace ov
2 changes: 2 additions & 0 deletions src/cpp/src/module_genai/module_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class IBaseModule {
bool m_splitted_model = false;
void check_splitted_model();

bool check_bool_param(const std::string& param_name, const bool& default_value);

// Initialize ov::Model from config models_map with param_name: "ov_model"
void init_ov_model();
};
Expand Down
7 changes: 5 additions & 2 deletions src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ void DenoiserLoopModule::print_static_config() {
type: "OVTensor" # Support DataType: [OVTensor]
params:
model_path: "model"
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string.
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string. But `splitted_model` and `dynamic_load_weights` depend on it.
dynamic_load_weights: "bool value" # [Optional], default false. Whether to dynamically load/release model weights during inference to save GPU memory.
)" << std::endl;
}

Expand Down Expand Up @@ -82,6 +83,8 @@ bool DenoiserLoopModule::initialize() {

check_splitted_model();

m_dynamic_load_weights = check_bool_param("dynamic_load_weights", false);

std::filesystem::path model_path = module_desc->get_full_path(it_path->second);
auto transformer_model_path = model_path / "transformer/openvino_model.xml";
if (m_model_type == DiffusionModelType::ZIMAGE) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <regex>
#include "logger.hpp"
#include "module_genai/utils/thread_helper.hpp"

namespace ov::genai::module {

Expand All @@ -12,9 +13,12 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
: m_dynamic_load_model_weights(dynamic_load_model_weights),
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
m_properties(properties) {
if (m_dynamic_load_model_weights) {
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
}
// parse all splitted model paths, model_path is the directory that contains all splitted models
get_splitted_model_paths(model_path, device);
load_model(model_path, properties, device);
load_model(model_path, m_properties, device);
}

void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path, const std::string& device) {
Expand Down Expand Up @@ -96,12 +100,22 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
m_postprocess_infer_request = m_postprocess_compiled_model.create_infer_request();
}

auto properties_splitted_model = properties;
for (const auto& path : m_splitted_model_paths) {
auto model = utils::singleton_core().read_model(path);
if (m_is_gpu) {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties));
if (m_dynamic_load_model_weights) {
properties_splitted_model[ov::weights_path.name()] = std::filesystem::path(path).replace_extension(".bin").string();
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
// weights are needed.
cm.release_model_weights();
m_compiled_models.push_back(std::move(cm));
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
}
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties));
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties_splitted_model));
}
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
Expand All @@ -128,6 +142,15 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {

m_full_infer_request.infer();
#else
int num_splitted_models = static_cast<int>(m_compiled_models.size());
OPENVINO_ASSERT(num_splitted_models > 1,
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));

std::future<bool> future_flag;
if (m_dynamic_load_model_weights) {
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0], m_infer_requests[0]));
}

// Preprocess
for (const auto& input : inputs) {
m_preprocess_infer_request.set_tensor(input.first, input.second.as<ov::Tensor>());
Expand Down Expand Up @@ -155,14 +178,32 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
ov::Tensor ppw_tensor = m_preprocess_infer_request.get_tensor("ppw");

// Splitted models
for (size_t i = 0; i < m_infer_requests.size(); ++i) {
std::future<bool> next_future_flag;
for (int i = 0; i < num_splitted_models; ++i) {
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
Comment on lines +207 to +209
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next_future_flag is declared unconditionally but is only used when ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT is enabled. With the default value (0), this becomes an unused variable and can trigger compiler warnings; consider declaring it under the same #if guard or marking it [[maybe_unused]].

Copilot uses AI. Check for mistakes.
if (m_dynamic_load_model_weights) {
if (i + 1 < num_splitted_models) {
next_future_flag =
thread_utils::load_model_weights_async(m_compiled_models[i + 1], m_infer_requests[i + 1]);
}
if (future_flag.valid())
future_flag.wait();
}

m_infer_requests[i].set_output_tensor(0, hidden_states_tensor);
m_infer_requests[i].set_tensor("hidden_states", hidden_states_tensor);
m_infer_requests[i].set_tensor("text_embeds", text_embeds_tensor);
m_infer_requests[i].set_tensor("timestep_proj", timestep_proj_tensor);
m_infer_requests[i].set_tensor("rotary_cos", rotary_cos_tensor);
m_infer_requests[i].set_tensor("rotary_sin", rotary_sin_tensor);
m_infer_requests[i].infer();
{
PROFILE(pmi, "infer");
m_infer_requests[i].infer();
}
if (m_dynamic_load_model_weights) {
thread_utils::release_model_weights_async(m_compiled_models[i], m_infer_requests[i]);
}
future_flag = std::move(next_future_flag);
}

GENAI_DEBUG("hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()));
Expand Down
61 changes: 47 additions & 14 deletions src/cpp/src/module_genai/utils/thread_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,70 @@

#pragma once

#include <future>
#include <chrono>
#include <future>
#include <thread>

#include "openvino/runtime/compiled_model.hpp"
#include "profiler.hpp"

namespace ov::genai::module::thread_utils {

inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
#ifndef ENABLE_DYNAMIC_MODEL_WEIGHTS
# define ENABLE_DYNAMIC_MODEL_WEIGHTS 1
#endif

#ifndef DISABLE_THREAD
# define DISABLE_THREAD 1 // Currently, multiple threads may cause a GPU crash.
#endif

#if ENABLE_DYNAMIC_MODEL_WEIGHTS
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
#if DISABLE_THREAD
PROFILE(pm, "load_model_weights sync");
compiled_model.load_model_weights();
infer_request = compiled_model.create_infer_request();
return std::async(std::launch::deferred, []() -> bool { return true; });
#else
auto load_fun = [&]() -> bool {
// compiled_model.load_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(500));
PROFILE(pm, "load_model_weights async");
compiled_model.load_model_weights();
infer_request = compiled_model.create_infer_request();
return true;
};
return std::async(std::launch::async, load_fun);
#endif
}

inline void load_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
}

inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model) {
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
#if DISABLE_THREAD
PROFILE(pm, "release_model_weights sync");
compiled_model.release_model_weights();
// release infer request to release the reference to the model weights
infer_request = ov::InferRequest();
return std::async(std::launch::deferred, []() -> bool { return true; });
#else
auto load_fun = [&]() -> bool {
// compiled_model.release_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
PROFILE(pm, "release_model_weights async");
compiled_model.release_model_weights();
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
return true;
};
return std::async(std::launch::async, load_fun);
#endif
}

inline void release_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
#else
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest& infer_request) {
return std::async(std::launch::deferred, []() -> bool {
return true;
});
}
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model,
ov::InferRequest& infer_request) {
return std::async(std::launch::deferred, []() -> bool {
return true;
});
}
#endif

} // namespace ov::genai::module::thread_utils
Loading