Skip to content

Commit fb0c870

Browse files
popovaan, ilya-lavrenov, Wovchena
authored
Optimize get_inputs_embeds() for Qwen2VL. (openvinotoolkit#2037)
The image embeddings merger is moved to a separate method, and in chat mode it is invoked only when new images are passed.

Timing measurements are given below for 6 chat iterations with Qwen2-VL-2B-Instruct, where an image is passed on the first and third iterations.

This branch:
- Chat iteration 1 (new image): encode time: 2012 ms, get_inputs_embeds time: 7683 ms
- Chat iteration 2: encode time: 0 ms, get_inputs_embeds time: 7 ms
- Chat iteration 3 (new image): encode time: 2359 ms, get_inputs_embeds time: 29179 ms
- Chat iteration 4: encode time: 0 ms, get_inputs_embeds time: 10 ms
- Chat iteration 5: encode time: 0 ms, get_inputs_embeds time: 11 ms
- Chat iteration 6: encode time: 0 ms, get_inputs_embeds time: 8 ms

Master:
- Chat iteration 1 (new image): encode time: 1893 ms, get_inputs_embeds time: 8394 ms
- Chat iteration 2: encode time: 0 ms, get_inputs_embeds time: 7664 ms
- Chat iteration 3 (new image): encode time: 2126 ms, get_inputs_embeds time: 27954 ms
- Chat iteration 4: encode time: 0 ms, get_inputs_embeds time: 27944 ms
- Chat iteration 5: encode time: 0 ms, get_inputs_embeds time: 27974 ms
- Chat iteration 6: encode time: 0 ms, get_inputs_embeds time: 27970 ms

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
1 parent b4ed057 commit fb0c870

File tree

15 files changed

+84
-51
lines changed

15 files changed

+84
-51
lines changed

src/cpp/src/icontinuous_batching.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -174,13 +174,17 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
174174
prompt_with_tags = add_image_tags_to_prompt(prompt_with_tags, rgbs, m_history_images.size());
175175
}
176176
m_history.push_back({{"role", "user"}, {"content", prompt_with_tags}});
177+
auto start_get_inputs_embeds = std::chrono::steady_clock::now();
177178
const auto encoded_images = m_inputs_embedder->encode_images(rgbs);
178179
m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());
179-
std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);
180180

181+
std::string templated_history = m_tokenizer.apply_chat_template(m_history, true);
181182
m_inputs_embedder->set_apply_chat_template_status(false);
182183

183-
input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0]));
184+
input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0));
185+
auto end_get_inputs_embeds = std::chrono::steady_clock::now();
186+
vlm_perf_metrics[0].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds));
187+
184188
} else {
185189
for (size_t i = 0; i < prompts.size(); i++) {
186190
const auto& prompt = prompts[i];

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,8 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st
244244
return m_impl->get_inputs_embeds(prompt, images, metrics);
245245
}
246246

247-
ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
248-
return m_impl->get_inputs_embeds(prompt, images, metrics);
247+
ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings) {
248+
return m_impl->get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings);
249249
}
250250

251251
std::vector<ov::genai::EncodedImage> InputsEmbedder::encode_images(const std::vector<ov::Tensor>& images) {

src/cpp/src/visual_language/inputs_embedder.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class InputsEmbedder {
3737
// compute input embedding for prompt and multiple images
3838
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
3939

40-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics);
40+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true);
4141

4242
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images);
4343

@@ -98,7 +98,7 @@ class InputsEmbedder {
9898
size_t m_image_id = 0;
9999

100100
public:
101-
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) = 0;
101+
virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true) = 0;
102102

103103
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::Tensor>& images, ov::genai::VLMPerfMetrics& metrics);
104104

src/cpp/src/visual_language/internvl_chat/classes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ InputsEmbedderInternVLChat::InputsEmbedderInternVLChat(
226226
const ov::AnyMap device_config) :
227227
IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { }
228228

229-
ov::Tensor InputsEmbedderInternVLChat::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
229+
ov::Tensor InputsEmbedderInternVLChat::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings) {
230230
std::string image_start_token = m_vlm_config.image_start_token;
231231
std::string image_context_token = m_vlm_config.image_context_token;
232232
std::string image_end_token = m_vlm_config.image_end_token;

src/cpp/src/visual_language/internvl_chat/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder {
3535
const std::string& device,
3636
const ov::AnyMap device_config);
3737

38-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;
38+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true) override;
3939
};
4040

4141
} // namespace ov::genai

src/cpp/src/visual_language/llava/classes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVA::encode_images(const st
114114
return embeds;
115115
}
116116

117-
ov::Tensor InputsEmbedderLLaVA::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
117+
ov::Tensor InputsEmbedderLLaVA::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings) {
118118
std::string image_token = m_vlm_config.im_start;
119119

120120
std::string formatted_prompt;

src/cpp/src/visual_language/llava/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder {
3535
const std::string& device,
3636
const ov::AnyMap device_config);
3737

38-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;
38+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true) override;
3939

4040
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images) override;
4141
protected:

src/cpp/src/visual_language/llava_next/classes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,7 @@ std::vector<ov::genai::EncodedImage> InputsEmbedderLLaVANext::encode_images(cons
343343
return embeds;
344344
}
345345

346-
ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
346+
ov::Tensor InputsEmbedderLLaVANext::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings) {
347347
std::string image_token = m_vlm_config.im_start;
348348

349349
std::string formatted_prompt;

src/cpp/src/visual_language/llava_next/classes.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA {
2222
public:
2323
using InputsEmbedderLLaVA::InputsEmbedderLLaVA;
2424

25-
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) override;
25+
ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true) override;
2626

2727
std::vector<ov::genai::EncodedImage> encode_images(const std::vector<ov::Tensor>& images) override;
2828
};

src/cpp/src/visual_language/minicpm/classes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ void adjust_pos_cache(
566566

567567
} // namespace
568568

569-
ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
569+
ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings) {
570570
auto [unified_prompt, images_sequence] = normalize_prompt(
571571
prompt,
572572
NATIVE_TAG,

0 commit comments

Comments
 (0)