Skip to content

Commit 2b8db2d

Browse files
committed
add build option: ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS, default off;
Signed-off-by: xiping.yan <xiping.yan@intel.com>
1 parent 8d6154d commit 2b8db2d

File tree

7 files changed

+68
-45
lines changed

7 files changed

+68
-45
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,11 @@ __pycache__
4545
_codeql_detected_source_root
4646
install
4747
*.avi
48+
49+
# Module-GenAI generated files
50+
dumped_*.yaml
51+
generated_*.bmp
52+
samples/python/module_genai/*.sh
53+
tests/module_genai/cpp/*.sh
54+
tests/module_genai/cpp/*.yaml
55+
tests/module_genai/cpp/test_data/*.json

cmake/features.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ option(ENABLE_TESTS "Enable tests build" ON)
1010
option(ENABLE_TOOLS "Enable tools build" ON)
1111
option(ENABLE_GGUF "Enable support for GGUF format" ON)
1212
option(ENABLE_XGRAMMAR "Enable support for structured output generation with xgrammar backend" ON)
13+
option(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS "Enable offloading model weights (load/release)" OFF)
1314

1415
# Disable building samples for NPM package
1516
if(CPACK_GENERATOR STREQUAL "NPM")

src/cpp/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ if(ENABLE_GGUF)
146146
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_GGUF)
147147
endif()
148148

149+
if(ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS)
150+
target_compile_definitions(${TARGET_NAME_OBJ} PRIVATE ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS=1)
151+
endif()
152+
149153
target_include_directories(${TARGET_NAME_OBJ} SYSTEM PRIVATE "${safetensors.h_SOURCE_DIR}")
150154

151155
target_link_libraries(${TARGET_NAME_OBJ} PRIVATE openvino::runtime openvino::threading nlohmann_json::nlohmann_json minja ${YAML_CPP_TARGET} PRIVATE TBB::tbb)

src/cpp/src/module_genai/modules/md_denoiser_loop/splitted_model_infer.cpp

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ CSplittedModelInfer::CSplittedModelInfer(const std::string& model_path,
1515
: m_dynamic_load_model_weights(dynamic_load_model_weights),
1616
m_is_gpu(device.find("GPU") != std::string::npos || device.find("gpu") != std::string::npos),
1717
m_properties(properties) {
18+
#if !ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
19+
OPENVINO_ASSERT(!m_dynamic_load_model_weights,
20+
"Dynamic loading of model weights is not enabled in this build. Please set "
21+
"ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS to 1 and rebuild.");
22+
#endif
23+
1824
if (m_dynamic_load_model_weights) {
1925
OPENVINO_ASSERT(m_is_gpu, "Dynamic loading of model weights is currently only supported for GPU device.");
2026
}
@@ -46,7 +52,8 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
4652
continue;
4753
}
4854

49-
// check if the file name ends with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess model
55+
// check if the file name ends with "_preprocess.xml" or "_postprocess.xml" for preprocess and postprocess
56+
// model
5057
if (filename.size() > 15 && filename.substr(filename.size() - 15) == "_preprocess.xml") {
5158
m_preprocess_model_path = entry.path().string();
5259
} else if (filename.size() > 16 && filename.substr(filename.size() - 16) == "_postprocess.xml") {
@@ -79,7 +86,9 @@ void CSplittedModelInfer::get_splitted_model_paths(const std::string& model_path
7986
"Both preprocessing (_preprocess.xml) and postprocessing (_postprocess.xml) models are required.");
8087
}
8188

82-
void CSplittedModelInfer::load_model(const std::string& model_path, const ov::AnyMap& properties, const std::string& device) {
89+
void CSplittedModelInfer::load_model(const std::string& model_path,
90+
const ov::AnyMap& properties,
91+
const std::string& device) {
8392
#if USE_FULL_MODEL
8493
#else
8594
{
@@ -107,18 +116,21 @@ void CSplittedModelInfer::load_model(const std::string& model_path, const ov::An
107116
auto model = utils::singleton_core().read_model(path);
108117
if (m_is_gpu) {
109118
if (m_dynamic_load_model_weights) {
110-
properties_splitted_model[ov::weights_path.name()] = std::filesystem::path(path).replace_extension(".bin").string();
119+
properties_splitted_model[ov::weights_path.name()] =
120+
std::filesystem::path(path).replace_extension(".bin").string();
111121
auto cm = utils::singleton_core().compile_model(model, m_context, properties_splitted_model);
112122
// Release model weights after compilation to save GPU memory. Load weights again in infer() when
113123
// weights are needed.
114124
cm.release_model_weights();
115125
m_compiled_models.push_back(std::move(cm));
116126
} else {
117-
m_compiled_models.push_back(utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
127+
m_compiled_models.push_back(
128+
utils::singleton_core().compile_model(model, m_context, properties_splitted_model));
118129
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
119130
}
120131
} else {
121-
m_compiled_models.push_back(utils::singleton_core().compile_model(model, device, properties_splitted_model));
132+
m_compiled_models.push_back(
133+
utils::singleton_core().compile_model(model, device, properties_splitted_model));
122134
m_infer_requests.push_back(m_compiled_models.back().create_infer_request());
123135
}
124136
}
@@ -149,10 +161,19 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
149161
OPENVINO_ASSERT(num_splitted_models > 1,
150162
"Splitted models should be at least 2, but got " + std::to_string(num_splitted_models));
151163

164+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
165+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
152166
std::future<bool> future_flag;
153167
if (m_dynamic_load_model_weights) {
154168
future_flag = std::move(thread_utils::load_model_weights_async(m_compiled_models[0]));
155169
}
170+
# else // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
171+
if (m_dynamic_load_model_weights) {
172+
PROFILE(pm, "load_model_weights");
173+
m_compiled_models[0].load_model_weights();
174+
}
175+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
176+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
156177

157178
// Preprocess
158179
for (const auto& input : inputs) {
@@ -186,14 +207,20 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
186207
PROFILE(pm, "splitted_model_infer_" + std::to_string(i));
187208
ov::InferRequest curInferRequest;
188209
if (m_dynamic_load_model_weights) {
210+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
189211
if (i + 1 < num_splitted_models) {
190-
next_future_flag =
191-
thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
212+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
213+
next_future_flag = thread_utils::load_model_weights_async(m_compiled_models[i + 1]);
214+
# else
215+
m_compiled_models[i + 1].load_model_weights();
216+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
192217
}
218+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
193219
if (future_flag.valid())
194220
future_flag.wait();
195-
221+
# endif // ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
196222
curInferRequest = m_compiled_models[i].create_infer_request();
223+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
197224
} else {
198225
curInferRequest = m_infer_requests[i];
199226
}
@@ -208,10 +235,22 @@ void CSplittedModelInfer::infer(const ov::AnyMap& inputs) {
208235
PROFILE(pmi, "infer");
209236
curInferRequest.infer();
210237
}
238+
239+
# ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
211240
if (m_dynamic_load_model_weights) {
241+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
212242
thread_utils::release_model_weights_async(m_compiled_models[i], std::move(curInferRequest));
243+
# else
244+
curInferRequest = ov::InferRequest(); // release infer request before releasing model weights to ensure the
245+
// model weights can be released successfully.
246+
m_compiled_models[i].release_model_weights();
247+
# endif
213248
}
249+
250+
# if ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
214251
future_flag = std::move(next_future_flag);
252+
# endif
253+
# endif // ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
215254
}
216255

217256
GENAI_DEBUG(

src/cpp/src/module_genai/utils/thread_helper.hpp

Lines changed: 4 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,60 +12,29 @@
1212

1313
namespace ov::genai::module::thread_utils {
1414

15-
#ifndef ENABLE_DYNAMIC_MODEL_WEIGHTS
16-
# define ENABLE_DYNAMIC_MODEL_WEIGHTS 1
15+
#ifndef ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT
16+
# define ENABLE_MULTIPLE_THREAD_LOAD_MODEL_WEIGHT 0 // Currently, multi-threaded weight loading may cause a GPU crash.
1717
#endif
1818

19-
#ifndef DISABLE_THREAD
20-
# define DISABLE_THREAD 1 // Current multiple threads may cause GPU crash.
21-
#endif
22-
23-
#if ENABLE_DYNAMIC_MODEL_WEIGHTS
19+
#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
2420
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
25-
#if DISABLE_THREAD
26-
PROFILE(pm, "load_model_weights sync");
27-
compiled_model.load_model_weights();
28-
// infer_request = compiled_model.create_infer_request();
29-
return std::async(std::launch::deferred, []() -> bool { return true; });
30-
#else
3121
auto load_fun = [&]() -> bool {
3222
PROFILE(pm, "load_model_weights async");
3323
compiled_model.load_model_weights();
3424
// infer_request = compiled_model.create_infer_request();
3525
return true;
3626
};
3727
return std::async(std::launch::async, load_fun);
38-
#endif
3928
}
4029

4130
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model, ov::InferRequest infer_request) {
42-
#if DISABLE_THREAD
43-
PROFILE(pm, "release_model_weights sync");
44-
compiled_model.release_model_weights();
45-
// release infer request to release the reference to the model weights
46-
infer_request = ov::InferRequest();
47-
return std::async(std::launch::deferred, []() -> bool { return true; });
48-
#else
4931
auto load_fun = [&]() -> bool {
5032
PROFILE(pm, "release_model_weights async");
51-
compiled_model.release_model_weights();
5233
infer_request = ov::InferRequest(); // reset infer request to release the reference to the model weights
34+
compiled_model.release_model_weights();
5335
return true;
5436
};
5537
return std::async(std::launch::async, load_fun);
56-
#endif
57-
}
58-
#else
59-
inline std::future<bool> load_model_weights_async(ov::CompiledModel& compiled_model) {
60-
return std::async(std::launch::deferred, []() -> bool {
61-
return true;
62-
});
63-
}
64-
inline std::future<bool> release_model_weights_async(ov::CompiledModel& compiled_model,
65-
ov::InferRequest& infer_request) {
66-
return std::async(std::launch::deferred, []() -> bool {
67-
return true;
68-
});
6938
}
7039
#endif
7140

tests/module_genai/cpp/.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
dumped*.yaml
2-
profile*.json
2+
profile*.json
3+
unittest_cache_dir_*

tests/module_genai/cpp/modules/DenoiserLoopModule.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,12 +58,13 @@ std::vector<DenoiserLoopTestData> denoiser_loop_test_data() {
5858
wan_data_splitted_model.splitted_model = true;
5959
datas.push_back(wan_data_splitted_model);
6060

61+
#ifdef ENABLE_DYNAMIC_LOAD_MODEL_WEIGHTS
6162
// Dynamic load weights for Split model
6263
DenoiserLoopTestData wan_data_dyn_weights = wan_data;
6364
wan_data_dyn_weights.splitted_model = true;
6465
wan_data_dyn_weights.dynamic_load_model_weights = true;
6566
datas.push_back(wan_data_dyn_weights);
66-
67+
#endif
6768
return datas;
6869
}
6970

0 commit comments

Comments
 (0)