Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
1c0048a
remote context.
xipingyan Feb 6, 2026
3662cf6
Enable remote tensor.
xipingyan Feb 6, 2026
a10c31a
use genai_debug to print log.
xipingyan Feb 6, 2026
940531b
draft
xipingyan Feb 6, 2026
307c8e1
disable multiple thread load/release weight.
xipingyan Feb 6, 2026
234d0a5
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 6, 2026
5ab6d13
add macro, easy to debug.
xipingyan Feb 6, 2026
78d56a9
Default disable dynamic load weights, performance is low.
xipingyan Feb 6, 2026
28f624c
Apply suggestion from @Copilot
xipingyan Feb 6, 2026
7266143
enable dynamic load model weights unit test.
xipingyan Feb 7, 2026
d45d2d2
skipped, only for dynamic load weights.
xipingyan Feb 7, 2026
bc53caf
dynamic load model weights, not create inferrequest.
xipingyan Feb 7, 2026
de987d9
Apply suggestion from @Copilot
xipingyan Feb 12, 2026
2e3450f
Apply suggestions from code review
xipingyan Feb 12, 2026
8d6154d
Merge remote-tracking branch 'origin/master_modular_genai' into xp/en…
xipingyan Feb 25, 2026
2b8db2d
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
xipingyan Feb 26, 2026
1adbc6c
fix build issue.
xipingyan Feb 26, 2026
daf16dc
Update src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_mod…
xipingyan Feb 26, 2026
3aad6b6
rename to release_fun
xipingyan Feb 26, 2026
4b3a6be
Fix dangling reference in async lambda captures in thread_helper.hpp …
Copilot Feb 26, 2026
1296c2c
-#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
xipingyan Feb 26, 2026
0d416d5
Fix GPU device skip condition in DenoiserLoopModule test to use subst…
Copilot Feb 26, 2026
3a25241
Update src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
xipingyan Feb 26, 2026
422d914
remove duplicated: check_cache_dir
xipingyan Feb 26, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,11 @@ __pycache__
_codeql_detected_source_root
install
*.avi

# Module-GenAI generated files
dumped_*.yaml
generated_*.bmp
samples/python/module_genai/*.sh
tests/module_genai/cpp/*.sh
tests/module_genai/cpp/*.yaml
tests/module_genai/cpp/test_data/*.json
1 change: 1 addition & 0 deletions cmake/features.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ option(ENABLE_TESTS "Enable tests build" ON)
option(ENABLE_TOOLS "Enable tools build" ON)
option(ENABLE_GGUF "Enable support for GGUF format" ON)
option(ENABLE_XGRAMMAR "Enable support for structured output generation with xgrammar backend" ON)
option(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS "Enable offloading model weights (load/release)" OFF)

# Disable building samples for NPM package
if(CPACK_GENERATOR STREQUAL "NPM")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ pipeline_modules:
cache_dir: ./cache_dir_denoiser_loop/
model_path: tests/module_genai/cpp/test_models/Wan2.1-T2V-1.3B-Diffusers
splitted_model: true
dynamic_load_weights: false # performance is low.
type: DenoiserLoopModule
latent_image:
device: CPU
Expand Down
4 changes: 4 additions & 0 deletions src/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,10 @@ if(ENABLE_GGUF)
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_GGUF)
endif()

if(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS)
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS=1)
endif()

target_include_directories(${TARGET_NAME_OBJ} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")

target_link_libraries(${TARGET_NAME_OBJ} PRIVATE openvino::runtime openvino::threading nlohmann_json::nlohmann_json minja ${YAML_CPP_TARGET} PRIVATE TBB::tbb)
Expand Down
18 changes: 18 additions & 0 deletions src/cpp/src/module_genai/module_base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,24 @@ void IBaseModule::check_splitted_model() {
}
}

// Read an optional module parameter and interpret it as a boolean.
// Recognized spellings: "true"/"True"/"TRUE"/"1" and "false"/"False"/"FALSE"/"0".
// A missing parameter silently yields default_value; an unrecognized value
// logs an error and also falls back to default_value.
bool IBaseModule::check_bool_param(const std::string& param_name, const bool& default_value) {
    const auto raw = get_optional_param(param_name);
    if (raw.empty()) {
        // Parameter not present in the config: use the caller-supplied default.
        return default_value;
    }

    const bool matches_true = (raw == "true" || raw == "True" || raw == "TRUE" || raw == "1");
    const bool matches_false = (raw == "false" || raw == "False" || raw == "FALSE" || raw == "0");
    if (matches_true || matches_false) {
        GENAI_INFO("Module[" + module_desc->name + "]: " + param_name + " = " +
                   (matches_true ? "true" : "false"));
        return matches_true;
    }

    // Value present but not a recognized boolean spelling: report and fall back.
    GENAI_ERR("Module[" + module_desc->name + "]: Invalid bool param value for '" + param_name + "': " + raw +
              ", use default value: " + (default_value ? "true" : "false"));
    return default_value;
}

// PipelineDesc implementation
PipelineDesc::PipelineDesc() : m_resource_cache(std::make_unique<PipelineResourceCache>()) {}

Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/module_genai/module_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ class IBaseModule {
bool m_splitted_model = false;
void check_splitted_model();

bool check_bool_param(const std::string& param_name, const bool& default_value);

// Initialize ov::Model from config models_map with param_name: "ov_model"
void init_ov_model();
};
Expand Down
13 changes: 10 additions & 3 deletions src/cpp/src/module_genai/modules/md_denoiser_loop/class.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,9 @@ void DenoiserLoopModule::print_static_config() {
type: "OVTensor" # Support DataType: [OVTensor]
params:
model_path: "model"
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string.
splitted_model: "bool value" # [Optional], default false.
cache_dir: "./cache_dir_transformer/" # [Optional], default is empty string. But `splitted_model` and `dynamic_load_weights` depend on it.
dynamic_load_weights: "bool value" # [Optional], default false. Whether to dynamically load/release model weights during inference to save GPU memory.
)" << std::endl;
}

Expand Down Expand Up @@ -82,6 +83,13 @@ bool DenoiserLoopModule::initialize() {

check_splitted_model();

m_dynamic_load_weights = check_bool_param("dynamic_load_weights", false);

check_cache_dir();
if (m_dynamic_load_weights && m_cache_dir.empty()) {
GENAI_ERR("TransformerModule[" + module_desc->name + "]: 'cache_dir' must be set when 'dynamic_load_weights' is enabled");
return false;
Comment on lines +89 to +91
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These error logs reference TransformerModule[...], but this is DenoiserLoopModule. Using the wrong module name makes debugging configuration issues harder; update the log prefix to match the actual module.

Copilot uses AI. Check for mistakes.
}
std::filesystem::path model_path = module_desc->get_full_path(it_path->second);
auto transformer_model_path = model_path / "transformer/openvino_model.xml";
if (m_model_type == DiffusionModelType::ZIMAGE) {
Expand All @@ -104,7 +112,6 @@ bool DenoiserLoopModule::initialize() {
}

auto properties = ov::AnyMap{};
check_cache_dir();
if (!m_cache_dir.empty()) {
properties["CACHE_DIR"] = m_cache_dir;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
#include "splitted_model_infer.hpp"

#include <regex>

#include "logger.hpp"
#include "module_genai/utils/tensor_utils.hpp"
#include "module_genai/utils/thread_helper.hpp"

namespace ov::genai::module {

Expand All @@ -12,9 +15,18 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
: m_dynamic_load_model_weights(dynamic_load_model_weights),
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
m_properties(properties) {
#ifndef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
OPENVINO_ASSERT(!m_dynamic_load_model_weights,
"Dynamic loading of model weights is not enabled in this build. Please set "
"ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild.");
#endif

if (m_dynamic_load_model_weights) {
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
}
// parse all splitted model paths, model_path is the directory that contains all splitted models
get_splitted_model_paths(model_path, device);
load_model(model_path, properties, device);
load_model(model_path, m_properties, device);
}

void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path, const std::string& device) {
Expand All @@ -40,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
continue;
}

// check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
// check if the file name end with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
// model
if (filename.size() > 15 && filename.substr(filename.size() - 15) == "_preprocess.xml") {
m_preprocess_model_path = entry.path().string();
} else if (filename.size() > 16 && filename.substr(filename.size() - 16) == "_postprocess.xml") {
Expand Down Expand Up @@ -73,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
"Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required.");
}

void CSplittedModelInfer::load_model(const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
void CSplittedModelInfer::load_model(const std::string& model_path,
const ov::AnyMap& properties,
const std::string& device) {
#if USE_FULL_MODEL
#else
{
Expand All @@ -96,14 +111,30 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
m_postprocess_infer_request = m_postprocess_compiled_model.create_infer_request();
}

auto properties_splitted_model = properties;
for (const auto& path : m_splitted_model_paths) {
auto model = utils::singleton_core().read_model(path);
if (m_is_gpu) {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties));
if (m_dynamic_load_model_weights) {
properties_splitted_model[ov::weights_path.name()] =
std::filesystem::path(path).replace_extension(".bin").string();
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
// weights are needed.
cm.release_model_weights();
# endif
m_compiled_models.push_back(std::move(cm));
} else {
m_compiled_models.push_back(
utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
} else {
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties));
m_compiled_models.push_back(
utils::singleton_core().compile_model(model, device, properties_splitted_model));
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
}
#endif
}
Expand All @@ -128,6 +159,24 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {

m_full_infer_request.infer();
#else
int num_splitted_models = static_cast<int>(m_compiled_models.size());
OPENVINO_ASSERT(num_splitted_models > 1,
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));

# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
std::future<bool> future_flag;
if (m_dynamic_load_model_weights) {
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0]));
}
# else // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
if (m_dynamic_load_model_weights) {
PROFILE(pm, "load_model_weights");
m_compiled_models[0].load_model_weights();
}
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS

// Preprocess
for (const auto& input : inputs) {
m_preprocess_infer_request.set_tensor(input.first, input.second.as<ov::Tensor>());
Expand Down Expand Up @@ -155,17 +204,64 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
ov::Tensor ppw_tensor = m_preprocess_infer_request.get_tensor("ppw");

// Splitted models
for (size_t i = 0; i < m_infer_requests.size(); ++i) {
m_infer_requests[i].set_output_tensor(0, hidden_states_tensor);
m_infer_requests[i].set_tensor("hidden_states", hidden_states_tensor);
m_infer_requests[i].set_tensor("text_embeds", text_embeds_tensor);
m_infer_requests[i].set_tensor("timestep_proj", timestep_proj_tensor);
m_infer_requests[i].set_tensor("rotary_cos", rotary_cos_tensor);
m_infer_requests[i].set_tensor("rotary_sin", rotary_sin_tensor);
m_infer_requests[i].infer();
std::future<bool> next_future_flag;
for (int i = 0; i < num_splitted_models; ++i) {
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
Comment on lines +207 to +209
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

next_future_flag is declared unconditionally but is only used when ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT is enabled. With the default value (0), this becomes an unused variable and can trigger compiler warnings; consider declaring it under the same #if guard or marking it [[maybe_unused]].

Copilot uses AI. Check for mistakes.
ov::InferRequest curInferRequest;
if (m_dynamic_load_model_weights) {
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
if (i + 1 < num_splitted_models) {
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
next_future_flag = thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
# else
m_compiled_models[i + 1].load_model_weights();
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
}
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
if (future_flag.valid())
future_flag.wait();
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
Comment on lines +220 to +223
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When multithreaded loading is enabled, the code uses future_flag.wait() but never calls get(). This can silently swallow exceptions from load_model_weights_async and make failures harder to diagnose; use get() (or propagate/log exceptions) after waiting.

Copilot uses AI. Check for mistakes.
curInferRequest = m_compiled_models[i].create_infer_request();
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
} else {
curInferRequest = m_infer_requests[i];
}

curInferRequest.set_output_tensor(0, hidden_states_tensor);
curInferRequest.set_tensor("hidden_states", hidden_states_tensor);
curInferRequest.set_tensor("text_embeds", text_embeds_tensor);
curInferRequest.set_tensor("timestep_proj", timestep_proj_tensor);
curInferRequest.set_tensor("rotary_cos", rotary_cos_tensor);
curInferRequest.set_tensor("rotary_sin", rotary_sin_tensor);
{
PROFILE(pmi, "infer");
curInferRequest.infer();
}

# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
if (m_dynamic_load_model_weights) {
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
auto release_future =
thread_utils::release_model_weights_async(m_compiled_models[i], std::move(curInferRequest));
if (release_future.valid()) {
release_future.wait();
Copy link

Copilot AI Feb 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Similarly, release_future.wait() does not call get(), so exceptions from the async release path are ignored. Consider consuming the future with get() (and handling/logging failures) to avoid hiding runtime issues.

Suggested change
release_future.wait();
try {
release_future.get();
} catch (const std::exception& e) {
GENAI_ERROR(std::string("Exception during async model weights release: ") + e.what());
} catch (...) {
GENAI_ERROR("Unknown exception during async model weights release");
}

Copilot uses AI. Check for mistakes.
}
# else
curInferRequest = ov::InferRequest(); // release infer request before releasing model weights to ensure the
// model weights can be released successfully.
m_compiled_models[i].release_model_weights();
# endif
}

# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
future_flag = std::move(next_future_flag);
# endif
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
}

GENAI_DEBUG("hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()));
GENAI_DEBUG(
"hidden_states_tensor is remote tensor: " + std::to_string(hidden_states_tensor.is<ov::RemoteTensor>()) +
", shape:" + tensor_utils::shape_to_string(hidden_states_tensor.get_shape()));

// Postprocess
m_postprocess_infer_request.set_tensor("hidden_states", hidden_states_tensor);
Expand Down
40 changes: 21 additions & 19 deletions src/cpp/src/module_genai/utils/thread_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,37 +3,39 @@

#pragma once

#include <future>
#include <chrono>
#include <future>
#include <thread>

#include "openvino/runtime/compiled_model.hpp"
#include "profiler.hpp"

namespace ov::genai::module::thread_utils {

inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
auto load_fun = [&]() -> bool {
// compiled_model.load_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(500));
#ifndef ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
# define ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT 0 // Current multiple threads may cause GPU crash.
#endif

#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
inline std::future<bool> load_model_weights_async(ov::CompiledModel compiled_model) {
auto load_fun = [compiled_model]() mutable -> bool {
PROFILE(pm, "load_model_weights async");
compiled_model.load_model_weights();
// infer_request = compiled_model.create_infer_request();
return true;
};
return std::async(std::launch::async, load_fun);
return std::async(std::launch::async, std::move(load_fun));
}

inline void load_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
}

inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model) {
auto load_fun = [&]() -> bool {
// compiled_model.release_model_weights();
std::this_thread::sleep_for(std::chrono::milliseconds(200));
inline std::future<bool> release_model_weights_async(ov::CompiledModel compiled_model, ov::InferRequest infer_request) {
auto release_fun = [compiled_model, infer_request]() mutable -> bool {
PROFILE(pm, "release_model_weights async");
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
compiled_model.release_model_weights();
return true;
};
return std::async(std::launch::async, load_fun);
}

inline void release_model_weights_finish(std::future<bool>& result_future) {
result_future.get();
return std::async(std::launch::async, std::move(release_fun));
}
#endif

} // namespace ov::genai::module::thread_utils
3 changes: 2 additions & 1 deletion tests/module_genai/cpp/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
dumped*.yaml
profile*.json
profile*.json
unittest_cache_dir_*
Loading
Loading