Commit 9afd259

Move MiniCPM resampler to vision encoder (openvinotoolkit#1997)

Moving the MiniCPM resampler to the vision encoder makes it possible to optimize get_inputs_embeds() on the second and subsequent chat iterations, so the resampler is not re-run each time. The timings below are for 6 chat iterations, where the image is passed only on the first iteration.

This branch:
  Chat iteration 1: encode time: 6585 ms, get_inputs_embeds time: 5 ms
  Chat iteration 2: encode time: 0 ms, get_inputs_embeds time: 3 ms
  Chat iteration 3: encode time: 0 ms, get_inputs_embeds time: 4 ms
  Chat iteration 4: encode time: 0 ms, get_inputs_embeds time: 4 ms
  Chat iteration 5: encode time: 0 ms, get_inputs_embeds time: 3 ms
  Chat iteration 6: encode time: 0 ms, get_inputs_embeds time: 3 ms

Master:
  Chat iteration 1: encode time: 6176 ms, get_inputs_embeds time: 358 ms
  Chat iteration 2: encode time: 0 ms, get_inputs_embeds time: 338 ms
  Chat iteration 3: encode time: 0 ms, get_inputs_embeds time: 339 ms
  Chat iteration 4: encode time: 0 ms, get_inputs_embeds time: 340 ms
  Chat iteration 5: encode time: 0 ms, get_inputs_embeds time: 335 ms
  Chat iteration 6: encode time: 0 ms, get_inputs_embeds time: 337 ms

1 parent 5e51e16 commit 9afd259
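
The change caches the resampler output on the EncodedImage produced by the vision encoder, so later chat turns only read the cached tensors. Below is a minimal, self-contained C++ sketch of that caching pattern; the types (Tensor, VisionEncoder, InputsEmbedder, ...) are simplified stand-ins for illustration, not the real ov::genai definitions.

// Simplified stand-ins for illustration only; not the real ov::genai types.
#include <iostream>
#include <vector>

using Tensor = std::vector<float>; // stand-in for ov::Tensor

struct ResampledImage {
    Tensor resampled_source;                                // resampled full image
    std::vector<std::vector<Tensor>> vision_embed_tensors;  // resampled slices
};

struct EncodedImage {
    Tensor resized_source;
    ResampledImage resampled_image; // filled once by the vision encoder
};

struct VisionEncoder {
    Tensor resample(const Tensor& embeddings) {
        std::cout << "resampler inference\n"; // the expensive model run
        return embeddings;
    }
    EncodedImage encode(const Tensor& image) {
        EncodedImage encoded{image, {}};
        // Resampling now happens here, once per image ...
        encoded.resampled_image.resampled_source = resample(encoded.resized_source);
        return encoded;
    }
};

struct InputsEmbedder {
    Tensor get_inputs_embeds(const EncodedImage& encoded) {
        // ... so every chat iteration just reads the cached result.
        return encoded.resampled_image.resampled_source;
    }
};

int main() {
    VisionEncoder encoder;
    InputsEmbedder embedder;
    EncodedImage encoded = encoder.encode(Tensor(4, 1.0f)); // iteration 1: single resampler run
    for (int turn = 0; turn < 5; ++turn) {
        embedder.get_inputs_embeds(encoded); // iterations 2..6: no resampler runs
    }
    return 0;
}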

File tree

7 files changed: +150 -80 lines changed
Lines changed: 18 additions & 0 deletions

@@ -0,0 +1,18 @@
+// Copyright (C) 2023-2025 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <map>
+#include "openvino/core/core.hpp"
+
+namespace ov {
+namespace genai {
+
+/// @brief A map of models for VLMPipeline constructor.
+/// Key is model name (e.g. "vision_embeddings", "text_embeddings", "language", "resampler")
+/// and value is a pair of model IR as string and weights as tensor.
+using ModelsMap = std::map<std::string, std::pair<std::string, ov::Tensor>>;
+
+}
+}
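
For context, a hedged sketch of how a caller might assemble such a ModelsMap: the model file names are taken from elsewhere in this diff, while read_text() and the already-loaded weight tensors are illustrative assumptions, not part of the library API.

// Hedged sketch only: read_text() and the pre-loaded weight tensors are assumptions for illustration.
#include <fstream>
#include <sstream>
#include <string>

#include "openvino/genai/common_types.hpp"

static std::string read_text(const std::string& path) {
    std::ifstream file(path);
    std::stringstream buffer;
    buffer << file.rdbuf();
    return buffer.str();
}

ov::genai::ModelsMap make_models_map(const ov::Tensor& vision_weights, const ov::Tensor& resampler_weights) {
    ov::genai::ModelsMap models_map;
    // Model IR is stored as XML text, weights as an ov::Tensor.
    models_map["vision_embeddings"] = {read_text("openvino_vision_embeddings_model.xml"), vision_weights};
    models_map["resampler"] = {read_text("openvino_resampler_model.xml"), resampler_weights};
    return models_map;
}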

src/cpp/include/openvino/genai/llm_pipeline.hpp

Lines changed: 1 addition & 5 deletions

@@ -14,6 +14,7 @@
 #include "openvino/genai/streamer_base.hpp"
 #include "openvino/genai/perf_metrics.hpp"
 #include "openvino/genai/scheduler_config.hpp"
+#include "openvino/genai/common_types.hpp"
 
 namespace ov {
 namespace genai {
@@ -26,11 +27,6 @@ using OptionalGenerationConfig = std::optional<GenerationConfig>;
 using EncodedInputs = std::variant<ov::Tensor, TokenizedInputs>;
 using StringInputs = std::variant<std::string, std::vector<std::string>>;
 
-/// @brief A map of models for VLMPipeline constructor.
-/// Key is model name (e.g. "vision_embeddings", "text_embeddings", "language", "resampler")
-/// and value is a pair of model IR as string and weights as tensor.
-using ModelsMap = std::map<std::string, std::pair<std::string, ov::Tensor>>;
-
 /**
  * @brief Structure to store resulting batched tokens and scores for each batch sequence.
  * The first num_return_sequences elements correspond to the first batch element.

src/cpp/src/visual_language/inputs_embedder.cpp

Lines changed: 1 addition & 2 deletions

@@ -88,8 +88,7 @@ InputsEmbedder::IInputsEmbedder::IInputsEmbedder(
     const ov::AnyMap device_config) :
     m_vlm_config{vlm_config},
     m_vision_encoder(VisionEncoder::create(
-        utils::get_model_weights_pair(models_map, "vision_embeddings").first,
-        utils::get_model_weights_pair(models_map, "vision_embeddings").second,
+        models_map,
         config_dir_path,
         m_vlm_config.model_type,
         device,

src/cpp/src/visual_language/minicpm/classes.cpp

Lines changed: 84 additions & 46 deletions

@@ -412,7 +412,31 @@ EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::Any
     ctx_clip.image_size = config.image_size;
     std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean);
     std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std);
-    return llava_image_embed_make_with_bytes_slice(ctx_clip, image, encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
+    EncodedImage encoded_image = llava_image_embed_make_with_bytes_slice(ctx_clip, image, encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums);
+    encoded_image.resampled_image = resample_encoded_image(encoded_image);
+    return encoded_image;
+}
+
+ResampledImage VisionEncoderMiniCPM::resample_encoded_image(const EncodedImage& encoded_image) {
+    const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size});
+    std::vector<std::vector<ov::Tensor>> vision_embed_tensors;
+    if (encoded_image.slices) {
+        size_t token_idx = 0;
+        const ov::Shape& slices_shape = encoded_image.slices.get_shape();
+        vision_embed_tensors.resize(slices_shape.at(0));
+        for (size_t i = 0; i < slices_shape.at(0); ++i) {
+            std::vector<ov::Tensor> vision_embeds;
+            vision_embeds.resize(slices_shape.at(1));
+            for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
+                size_t d2 = slices_shape.at(2);
+                size_t d3 = slices_shape.at(3);
+                ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
+                vision_embeds[ja] = resample(encoded_view, {encoded_image.slices_size});
+            }
+            vision_embed_tensors[i] = vision_embeds;
+        }
+    }
+    return {resampled_source, vision_embed_tensors};
 }
 
 namespace {
@@ -542,44 +566,6 @@ void adjust_pos_cache(
 
 } // namespace
 
-InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
-    const VLMConfig& vlm_config,
-    const std::filesystem::path& model_dir,
-    const std::string& device,
-    const ov::AnyMap device_config) :
-    IInputsEmbedder(vlm_config, model_dir, device, device_config) {
-    auto compiled_model =
-        utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, device_config);
-    ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model");
-    m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
-        compiled_model.get_property(ov::optimal_number_of_infer_requests),
-        [&compiled_model]() -> ov::InferRequest {
-            return compiled_model.create_infer_request();
-        });
-    m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
-}
-
-InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
-    const VLMConfig& vlm_config,
-    const ModelsMap& models_map,
-    const Tokenizer& tokenizer,
-    const std::filesystem::path& config_dir_path,
-    const std::string& device,
-    const ov::AnyMap device_config) :
-    IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {
-    auto compiled_model = utils::singleton_core().compile_model(
-        utils::get_model_weights_pair(models_map, "resampler").first,
-        utils::get_model_weights_pair(models_map, "resampler").second,
-        device,
-        device_config);
-    m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
-        compiled_model.get_property(ov::optimal_number_of_infer_requests),
-        [&compiled_model]() -> ov::InferRequest {
-            return compiled_model.create_infer_request();
-        });
-    m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
-}
-
 ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
     auto [unified_prompt, images_sequence] = normalize_prompt(
         prompt,
@@ -648,7 +634,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
     float* inputs_embeds_data = inputs_embeds.data<float>();
     for (size_t image_id : images_sequence) {
         const EncodedImage& encoded_image = images.at(image_id - m_prev_image_id);
-        const ov::Tensor& resampled_source = resample(encoded_image.resized_source, {encoded_image.resized_source_size});
+        const ov::Tensor& resampled_source = encoded_image.resampled_image.resampled_source;
         auto emb = resampled_source.data<float>();
         ids = std::find(ids, end, im_start_id);
         OPENVINO_ASSERT(end != ids);
@@ -660,10 +646,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
             const ov::Shape& slices_shape = encoded_image.slices.get_shape();
             for (size_t i = 0; i < slices_shape.at(0); ++i) {
                 for (size_t ja = 0; ja < slices_shape.at(1); ++ja) {
-                    size_t d2 = slices_shape.at(2);
-                    size_t d3 = slices_shape.at(3);
-                    ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, encoded_image.slices.data<float>() + (i * slices_shape.at(1) + ja) * d2 * d3};
-                    const ov::Tensor& vision_embed_tensor_i_j = resample(encoded_view, {encoded_image.slices_size});
+                    const ov::Tensor& vision_embed_tensor_i_j = encoded_image.resampled_image.vision_embed_tensors[i][ja];
                     ids = std::find(ids, end, slice_start_id);
                     OPENVINO_ASSERT(end != ids);
                     ++ids;
@@ -703,7 +686,7 @@ bool InputsEmbedderMiniCPM::prompt_has_image_tag(const std::string& prompt) cons
     return IInputsEmbedder::prompt_has_image_tag(prompt) || prompt.find(NATIVE_TAG) != std::string::npos;
 }
 
-ov::Tensor InputsEmbedderMiniCPM::resample(const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
+ov::Tensor VisionEncoderMiniCPM::resample(const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
     size_t bs = encoded_image.get_shape().at(0);
     std::vector<size_t> patch_len{target_sizes.size()};
     std::transform(target_sizes.begin(), target_sizes.end(), patch_len.begin(), [](const ImageSize& height_width) {
@@ -747,7 +730,62 @@ ov::Tensor InputsEmbedderMiniCPM::resample(const ov::Tensor& encoded_image, cons
     resampler.set_tensor("pos_embed", pos_embed); // [H*W, N, new_hidden_size]
     resampler.set_tensor("key_padding_mask", key_padding_mask); // [N, H*W]
     resampler.infer();
-    return resampler.get_output_tensor(); // [N, query_num, new_hidden_size]
+    auto resampler_out = resampler.get_output_tensor();
+    // resampler_out is bound to infer request and the data may become corrupted after next resampler inference
+    // so we need to return a copy to make sure data does not get corrupted
+    ov::Tensor res(resampler_out.get_element_type(), resampler_out.get_shape());
+    std::memcpy(res.data(), resampler_out.data(), resampler_out.get_byte_size());
+    return res; // [N, query_num, new_hidden_size]
+}
+
+VisionEncoderMiniCPM::VisionEncoderMiniCPM(
+    const std::filesystem::path& model_dir,
+    const std::string& device,
+    const ov::AnyMap properties) : VisionEncoder{model_dir, device, properties} {
+    m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(model_dir, "config.json");
+    auto compiled_model = utils::singleton_core().compile_model(model_dir / "openvino_resampler_model.xml", device, properties);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model");
+    m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+        compiled_model.get_property(ov::optimal_number_of_infer_requests),
+        [&compiled_model]() -> ov::InferRequest {
+            return compiled_model.create_infer_request();
+        });
+    m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
+}
+
+VisionEncoderMiniCPM::VisionEncoderMiniCPM(
+    const ModelsMap& models_map,
+    const std::filesystem::path& config_dir_path,
+    const std::string& device,
+    const ov::AnyMap device_config) : VisionEncoder{models_map, config_dir_path, device, device_config} {
+    const auto& resampler_model = utils::get_model_weights_pair(models_map, "resampler").first;
+    const auto& resampler_weights = utils::get_model_weights_pair(models_map, "resampler").second;
+    m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(config_dir_path, "config.json");
+    auto compiled_model = utils::singleton_core().compile_model(resampler_model, resampler_weights, device, device_config);
+    ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM resampler model");
+    m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
+        compiled_model.get_property(ov::optimal_number_of_infer_requests),
+        [&compiled_model]() -> ov::InferRequest {
+            return compiled_model.create_infer_request();
+        });
+    m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70});
 }
 
+
+InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
+    const VLMConfig& vlm_config,
+    const std::filesystem::path& model_dir,
+    const std::string& device,
+    const ov::AnyMap device_config) :
+    IInputsEmbedder(vlm_config, model_dir, device, device_config) {}
+
+InputsEmbedderMiniCPM::InputsEmbedderMiniCPM(
+    const VLMConfig& vlm_config,
+    const ModelsMap& models_map,
+    const Tokenizer& tokenizer,
+    const std::filesystem::path& config_dir_path,
+    const std::string& device,
+    const ov::AnyMap device_config) :
+    IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
+
 } // namespace ov::genai

src/cpp/src/visual_language/minicpm/classes.hpp

Lines changed: 22 additions & 10 deletions

@@ -13,13 +13,6 @@
 namespace ov::genai {
 
 class VisionEncoderMiniCPM : public VisionEncoder {
-public:
-    using VisionEncoder::VisionEncoder;
-
-    EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
-};
-
-class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     // A resampler model to resample image embeddings.
     // [N, H*W, old_hidden_size] is the input shape.
    // [N, query_num, hidden_size] is the output shape.
@@ -28,6 +21,27 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     // [70, 70, hidden_size]. 70 is the initial guess of the image
     // height and width after dividing by patch_size.
     ov::Tensor m_pos_embed_cache;
+    // VLM config
+    VLMConfig m_vlm_config;
+
+    ov::Tensor resample(const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes);
+public:
+    VisionEncoderMiniCPM(
+        const std::filesystem::path& model_dir,
+        const std::string& device,
+        const ov::AnyMap properties);
+
+
+    VisionEncoderMiniCPM(
+        const ModelsMap& models_map,
+        const std::filesystem::path& config_dir_path,
+        const std::string& device,
+        const ov::AnyMap device_config);
+    EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override;
+    ResampledImage resample_encoded_image(const EncodedImage& image);
+};
+
+class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
     size_t m_prev_image_id = 0;
 
 public:
@@ -36,7 +50,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
         const std::filesystem::path& model_dir,
         const std::string& device,
         const ov::AnyMap device_config);
-
+
     InputsEmbedderMiniCPM(
         const VLMConfig& vlm_config,
         const ModelsMap& models_map,
@@ -55,8 +69,6 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder {
 
     bool prompt_has_image_tag(const std::string& prompt) const override;
 
-private:
-    ov::Tensor resample(const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes);
 };
 
 } // namespace ov::genai

src/cpp/src/visual_language/vision_encoder.cpp

Lines changed: 11 additions & 11 deletions

@@ -26,12 +26,13 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::
 }
 
 VisionEncoder::VisionEncoder(
-    const std::string& model,
-    const ov::Tensor& weights,
+    const ModelsMap& models_map,
     const std::filesystem::path& config_dir_path,
     const std::string& device,
     const ov::AnyMap device_config) {
-    auto compiled_model = utils::singleton_core().compile_model(model, weights, device, device_config);
+    const auto& vision_encoder_model = utils::get_model_weights_pair(models_map, "vision_embeddings").first;
+    const auto& vision_encoder_weights = utils::get_model_weights_pair(models_map, "vision_embeddings").second;
+    auto compiled_model = utils::singleton_core().compile_model(vision_encoder_model, vision_encoder_weights, device, device_config);
     ov::genai::utils::print_compiled_model_properties(compiled_model, "VLM vision embeddings model");
     m_ireq_queue_vision_encoder = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
         compiled_model.get_property(ov::optimal_number_of_infer_requests),
@@ -64,24 +65,23 @@ VisionEncoder::Ptr VisionEncoder::create(const std::filesystem::path& model_dir,
 }
 
 VisionEncoder::Ptr VisionEncoder::create(
-    const std::string& model,
-    const ov::Tensor& weights,
+    const ModelsMap& models_map,
     const std::filesystem::path& config_dir_path,
     const VLMModelType model_type,
     const std::string& device,
     const ov::AnyMap device_config) {
     if (model_type == VLMModelType::MINICPM) {
-        return std::make_shared<VisionEncoderMiniCPM>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderMiniCPM>(models_map, config_dir_path, device, device_config);
     } else if (model_type == VLMModelType::LLAVA) {
-        return std::make_shared<VisionEncoderLLaVA>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderLLaVA>(models_map, config_dir_path, device, device_config);
     } else if (model_type == VLMModelType::LLAVA_NEXT) {
-        return std::make_shared<VisionEncoderLLaVANext>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderLLaVANext>(models_map, config_dir_path, device, device_config);
     } else if (model_type == VLMModelType::INTERNVL_CHAT) {
-        return std::make_shared<VisionEncoderInternVLChat>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderInternVLChat>(models_map, config_dir_path, device, device_config);
     } else if (model_type == VLMModelType::PHI3_V) {
-        return std::make_shared<VisionEncoderPhi3V>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderPhi3V>(models_map, config_dir_path, device, device_config);
     } else if (model_type == VLMModelType::QWEN2_VL) {
-        return std::make_shared<VisionEncoderQwen2VL>(model, weights, config_dir_path, device, device_config);
+        return std::make_shared<VisionEncoderQwen2VL>(models_map, config_dir_path, device, device_config);
     } else {
         OPENVINO_THROW("Unsupported model type in VLM VisionEncoder class. Please, create feature request on new model support");
     }

src/cpp/src/visual_language/vision_encoder.hpp

Lines changed: 13 additions & 6 deletions

@@ -5,6 +5,7 @@
 #include <memory>
 #include "openvino/runtime/infer_request.hpp"
 
+#include "openvino/genai/common_types.hpp"
 #include "visual_language/vlm_config.hpp"
 #include "visual_language/processor_config.hpp"
 #include "circular_buffer_queue.hpp"
@@ -18,6 +19,12 @@ struct ImageSize {
     size_t width;
 };
 
+
+struct ResampledImage {
+    ov::Tensor resampled_source;
+    std::vector<std::vector<ov::Tensor>> vision_embed_tensors;
+};
+
 /// @brief Embeddings of a given image. The number of slices is no
 /// greater than ProcessorConfig's max_slice_nums.
 struct EncodedImage {
@@ -45,6 +52,9 @@ struct EncodedImage {
 
     /// @brief Original size of the image
     ImageSize original_image_size;
+
+    /// @brief Resampled image, used only by MiniCPM.
+    ResampledImage resampled_image;
 };
 
 /// @brief A class used to infer embeddings of an image using
@@ -67,16 +77,14 @@ class VisionEncoder {
         const ov::AnyMap properties = {});
 
     /// @brief Constructs the encoder from models map.
-    /// @param model Model IR as string (openvino_vision_embeddings_model.xml)
-    /// @param weights Model weights as tensor (openvino_vision_embeddings_model.bin)
+    /// @param models_map Models map
     /// @param config_dir_path A path to directory containing preprocessor_config.json.
     /// @param model_type A type of VLM model.
     /// @param device A device to compile the encoder for.
     /// @param properties A config to be passed to
     /// ov::Core::compile_model().
     static VisionEncoder::Ptr create(
-        const std::string& model,
-        const ov::Tensor& weights,
+        const ModelsMap& models_map,
         const std::filesystem::path& config_dir_path,
         const VLMModelType model_type,
         const std::string& device,
@@ -110,8 +118,7 @@ class VisionEncoder {
         const ov::AnyMap properties);
 
     VisionEncoder(
-        const std::string& model,
-        const ov::Tensor& weights,
+        const ModelsMap& models_map,
         const std::filesystem::path& config_dir_path,
         const std::string& device,
         const ov::AnyMap properties);
