@@ -412,7 +412,31 @@ EncodedImage VisionEncoderMiniCPM::encode(const ov::Tensor& image, const ov::Any
412412 ctx_clip.image_size = config.image_size ;
413413 std::copy (config.norm_mean .begin (), config.norm_mean .end (), ctx_clip.image_mean );
414414 std::copy (config.norm_std .begin (), config.norm_std .end (), ctx_clip.image_std );
415- return llava_image_embed_make_with_bytes_slice (ctx_clip, image, encoder, config.max_slice_nums , config.scale_resolution , config.patch_size , 0 == config.max_slice_nums );
415+ EncodedImage encoded_image = llava_image_embed_make_with_bytes_slice (ctx_clip, image, encoder, config.max_slice_nums , config.scale_resolution , config.patch_size , 0 == config.max_slice_nums );
416+ encoded_image.resampled_image = resample_encoded_image (encoded_image);
417+ return encoded_image;
418+ }
419+
420+ ResampledImage VisionEncoderMiniCPM::resample_encoded_image (const EncodedImage& encoded_image) {
421+ const ov::Tensor& resampled_source = resample (encoded_image.resized_source , {encoded_image.resized_source_size });
422+ std::vector<std::vector<ov::Tensor>> vision_embed_tensors;
423+ if (encoded_image.slices ) {
424+ size_t token_idx = 0 ;
425+ const ov::Shape& slices_shape = encoded_image.slices .get_shape ();
426+ vision_embed_tensors.resize (slices_shape.at (0 ));
427+ for (size_t i = 0 ; i < slices_shape.at (0 ); ++i) {
428+ std::vector<ov::Tensor> vision_embeds;
429+ vision_embeds.resize (slices_shape.at (1 ));
430+ for (size_t ja = 0 ; ja < slices_shape.at (1 ); ++ja) {
431+ size_t d2 = slices_shape.at (2 );
432+ size_t d3 = slices_shape.at (3 );
433+ ov::Tensor encoded_view{ov::element::f32 , {1 , d2, d3}, encoded_image.slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
434+ vision_embeds[ja] = resample (encoded_view, {encoded_image.slices_size });
435+ }
436+ vision_embed_tensors[i] = vision_embeds;
437+ }
438+ }
439+ return {resampled_source, vision_embed_tensors};
416440}
417441
418442namespace {
@@ -542,44 +566,6 @@ void adjust_pos_cache(
542566
543567} // namespace
544568
545- InputsEmbedderMiniCPM::InputsEmbedderMiniCPM (
546- const VLMConfig& vlm_config,
547- const std::filesystem::path& model_dir,
548- const std::string& device,
549- const ov::AnyMap device_config) :
550- IInputsEmbedder (vlm_config, model_dir, device, device_config) {
551- auto compiled_model =
552- utils::singleton_core ().compile_model (model_dir / " openvino_resampler_model.xml" , device, device_config);
553- ov::genai::utils::print_compiled_model_properties (compiled_model, " VLM resampler model" );
554- m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
555- compiled_model.get_property (ov::optimal_number_of_infer_requests),
556- [&compiled_model]() -> ov::InferRequest {
557- return compiled_model.create_infer_request ();
558- });
559- m_pos_embed_cache = get_2d_sincos_pos_embed (m_vlm_config.hidden_size , {70 , 70 });
560- }
561-
562- InputsEmbedderMiniCPM::InputsEmbedderMiniCPM (
563- const VLMConfig& vlm_config,
564- const ModelsMap& models_map,
565- const Tokenizer& tokenizer,
566- const std::filesystem::path& config_dir_path,
567- const std::string& device,
568- const ov::AnyMap device_config) :
569- IInputsEmbedder (vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {
570- auto compiled_model = utils::singleton_core ().compile_model (
571- utils::get_model_weights_pair (models_map, " resampler" ).first ,
572- utils::get_model_weights_pair (models_map, " resampler" ).second ,
573- device,
574- device_config);
575- m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
576- compiled_model.get_property (ov::optimal_number_of_infer_requests),
577- [&compiled_model]() -> ov::InferRequest {
578- return compiled_model.create_infer_request ();
579- });
580- m_pos_embed_cache = get_2d_sincos_pos_embed (m_vlm_config.hidden_size , {70 , 70 });
581- }
582-
583569ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds (const std::string& prompt, const std::vector<ov::genai::EncodedImage>& images, ov::genai::VLMPerfMetrics& metrics) {
584570 auto [unified_prompt, images_sequence] = normalize_prompt (
585571 prompt,
@@ -648,7 +634,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
648634 float * inputs_embeds_data = inputs_embeds.data <float >();
649635 for (size_t image_id : images_sequence) {
650636 const EncodedImage& encoded_image = images.at (image_id - m_prev_image_id);
651- const ov::Tensor& resampled_source = resample ( encoded_image.resized_source , {encoded_image. resized_source_size }) ;
637+ const ov::Tensor& resampled_source = encoded_image.resampled_image . resampled_source ;
652638 auto emb = resampled_source.data <float >();
653639 ids = std::find (ids, end, im_start_id);
654640 OPENVINO_ASSERT (end != ids);
@@ -660,10 +646,7 @@ ov::Tensor InputsEmbedderMiniCPM::get_inputs_embeds(const std::string& prompt, c
660646 const ov::Shape& slices_shape = encoded_image.slices .get_shape ();
661647 for (size_t i = 0 ; i < slices_shape.at (0 ); ++i) {
662648 for (size_t ja = 0 ; ja < slices_shape.at (1 ); ++ja) {
663- size_t d2 = slices_shape.at (2 );
664- size_t d3 = slices_shape.at (3 );
665- ov::Tensor encoded_view{ov::element::f32 , {1 , d2, d3}, encoded_image.slices .data <float >() + (i * slices_shape.at (1 ) + ja) * d2 * d3};
666- const ov::Tensor& vision_embed_tensor_i_j = resample (encoded_view, {encoded_image.slices_size });
649+ const ov::Tensor& vision_embed_tensor_i_j = encoded_image.resampled_image .vision_embed_tensors [i][ja];
667650 ids = std::find (ids, end, slice_start_id);
668651 OPENVINO_ASSERT (end != ids);
669652 ++ids;
@@ -703,7 +686,7 @@ bool InputsEmbedderMiniCPM::prompt_has_image_tag(const std::string& prompt) cons
703686 return IInputsEmbedder::prompt_has_image_tag (prompt) || prompt.find (NATIVE_TAG) != std::string::npos;
704687}
705688
706- ov::Tensor InputsEmbedderMiniCPM ::resample (const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
689+ ov::Tensor VisionEncoderMiniCPM ::resample (const ov::Tensor& encoded_image, const std::vector<ImageSize>& target_sizes) {
707690 size_t bs = encoded_image.get_shape ().at (0 );
708691 std::vector<size_t > patch_len{target_sizes.size ()};
709692 std::transform (target_sizes.begin (), target_sizes.end (), patch_len.begin (), [](const ImageSize& height_width) {
@@ -747,7 +730,62 @@ ov::Tensor InputsEmbedderMiniCPM::resample(const ov::Tensor& encoded_image, cons
747730 resampler.set_tensor (" pos_embed" , pos_embed); // [H*W, N, new_hidden_size]
748731 resampler.set_tensor (" key_padding_mask" , key_padding_mask); // [N, H*W]
749732 resampler.infer ();
750- return resampler.get_output_tensor (); // [N, query_num, new_hidden_size]
733+ auto resampler_out = resampler.get_output_tensor ();
734+ // resampler_out is bound to infer request and the data may become corrupted after next resampler inference
735+ // so we need to return a copy to make sure data does not get corrupted
736+ ov::Tensor res (resampler_out.get_element_type (), resampler_out.get_shape ());
737+ std::memcpy (res.data (), resampler_out.data (), resampler_out.get_byte_size ());
738+ return res; // [N, query_num, new_hidden_size]
739+ }
740+
741+ VisionEncoderMiniCPM::VisionEncoderMiniCPM (
742+ const std::filesystem::path& model_dir,
743+ const std::string& device,
744+ const ov::AnyMap properties) : VisionEncoder{model_dir, device, properties} {
745+ m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(model_dir, " config.json" );
746+ auto compiled_model = utils::singleton_core ().compile_model (model_dir / " openvino_resampler_model.xml" , device, properties);
747+ ov::genai::utils::print_compiled_model_properties (compiled_model, " VLM resampler model" );
748+ m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
749+ compiled_model.get_property (ov::optimal_number_of_infer_requests),
750+ [&compiled_model]() -> ov::InferRequest {
751+ return compiled_model.create_infer_request ();
752+ });
753+ m_pos_embed_cache = get_2d_sincos_pos_embed (m_vlm_config.hidden_size , {70 , 70 });
754+ }
755+
756+ VisionEncoderMiniCPM::VisionEncoderMiniCPM (
757+ const ModelsMap& models_map,
758+ const std::filesystem::path& config_dir_path,
759+ const std::string& device,
760+ const ov::AnyMap device_config) : VisionEncoder{models_map, config_dir_path, device, device_config} {
761+ const auto & resampler_model = utils::get_model_weights_pair (models_map, " resampler" ).first ;
762+ const auto & resampler_weights = utils::get_model_weights_pair (models_map, " resampler" ).second ;
763+ m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(config_dir_path, " config.json" );
764+ auto compiled_model = utils::singleton_core ().compile_model (resampler_model, resampler_weights, device, device_config);
765+ ov::genai::utils::print_compiled_model_properties (compiled_model, " VLM resampler model" );
766+ m_ireq_queue_resampler = std::make_unique<CircularBufferQueue<ov::InferRequest>>(
767+ compiled_model.get_property (ov::optimal_number_of_infer_requests),
768+ [&compiled_model]() -> ov::InferRequest {
769+ return compiled_model.create_infer_request ();
770+ });
771+ m_pos_embed_cache = get_2d_sincos_pos_embed (m_vlm_config.hidden_size , {70 , 70 });
751772}
752773
774+
775+ InputsEmbedderMiniCPM::InputsEmbedderMiniCPM (
776+ const VLMConfig& vlm_config,
777+ const std::filesystem::path& model_dir,
778+ const std::string& device,
779+ const ov::AnyMap device_config) :
780+ IInputsEmbedder (vlm_config, model_dir, device, device_config) {}
781+
782+ InputsEmbedderMiniCPM::InputsEmbedderMiniCPM (
783+ const VLMConfig& vlm_config,
784+ const ModelsMap& models_map,
785+ const Tokenizer& tokenizer,
786+ const std::filesystem::path& config_dir_path,
787+ const std::string& device,
788+ const ov::AnyMap device_config) :
789+ IInputsEmbedder (vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {}
790+
753791} // namespace ov::genai