@@ -31,10 +31,9 @@ const std::string NATIVE_TAG = "<|vision_start|><|image_pad|><|vision_end|>";
3131std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::op::v0::Result>> patch_preprocess_branch_image (
3232 std::shared_ptr<ov::op::v0::Parameter> raw_images_1,
3333 std::shared_ptr<ov::op::v0::Parameter> resize_target_shape,
34- std::shared_ptr<ov::op::v0::Parameter > image_mean,
35- std::shared_ptr<ov::op::v0::Parameter > image_scale,
34+ std::shared_ptr<ov::op::v0::Constant > image_mean,
35+ std::shared_ptr<ov::op::v0::Constant > image_scale,
3636 std::shared_ptr<ov::op::v0::Parameter> broadcast_shape) {
37- std::cout << " patch_preprocess_branch_image" << std::endl;
3837 auto raw_images_f32_1 = std::make_shared<ov::op::v0::Convert>(raw_images_1, ov::element::f32 );
3938 auto img_trans_1 = std::make_shared<ov::op::v1::Transpose>(
4039 raw_images_f32_1,
@@ -55,11 +54,7 @@ std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::op::v0::Result>> patch
5554 auto temporal_images = std::make_shared<ov::op::v3::Broadcast>(resized_images_s_1, broadcast_shape);
5655 auto results = std::make_shared<ov::op::v0::Result>(temporal_images);
5756 return {std::make_shared<ov::Model>(results,
58- ov::ParameterVector{raw_images_1,
59- resize_target_shape,
60- image_mean,
61- image_scale,
62- broadcast_shape},
57+ ov::ParameterVector{raw_images_1, resize_target_shape, broadcast_shape},
6358 " then_body" ),
6459 results};
6560}
@@ -69,21 +64,16 @@ std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::op::v0::Result>> patch
6964 std::shared_ptr<ov::op::v0::Parameter> raw_images_1,
7065 std::shared_ptr<ov::op::v0::Parameter> raw_images_2,
7166 std::shared_ptr<ov::op::v0::Parameter> resize_target_shape,
72- std::shared_ptr<ov::op::v0::Parameter> image_mean,
73- std::shared_ptr<ov::op::v0::Parameter> image_scale) {
74- std::cout << " patch_preprocess_branch_video" << std::endl;
67+ std::shared_ptr<ov::op::v0::Constant> image_mean,
68+ std::shared_ptr<ov::op::v0::Constant> image_scale) {
7569 auto raw_images_f32_1 = std::make_shared<ov::op::v0::Convert>(raw_images_1, ov::element::f32 );
7670 auto raw_images_f32_2 = std::make_shared<ov::op::v0::Convert>(raw_images_2, ov::element::f32 );
7771 auto img_trans_1 = std::make_shared<ov::op::v1::Transpose>(
7872 raw_images_f32_1,
7973 std::make_shared<ov::op::v0::Constant>(ov::element::i32 , Shape{4 }, std::vector<int32_t >{0 , 3 , 1 , 2 }));
80- img_trans_1->set_friendly_name (" img_trans_1" );
81- img_trans_1->output (0 ).get_tensor ().set_names ({" img_trans_1" });
8274 auto img_trans_2 = std::make_shared<ov::op::v1::Transpose>(
8375 raw_images_f32_2,
8476 std::make_shared<ov::op::v0::Constant>(ov::element::i32 , Shape{4 }, std::vector<int32_t >{0 , 3 , 1 , 2 }));
85- img_trans_2->set_friendly_name (" img_trans_2" );
86- img_trans_2->output (0 ).get_tensor ().set_names ({" img_trans_2" });
8777
8878 ov::op::v0::Interpolate::Attributes attrs = {};
8979 attrs.axes = {2 , 3 };
@@ -109,24 +99,28 @@ std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::op::v0::Result>> patch
10999
110100 auto result_temperal_images = std::make_shared<ov::op::v0::Result>(temporal_images);
111101 auto result_tmp = std::make_shared<ov::op::v0::Result>(same_image);
112- return {std::make_shared<ov::Model>(
113- ov::ResultVector{result_temperal_images, result_tmp},
114- ov::ParameterVector{same_image, raw_images_1, raw_images_2, resize_target_shape, image_mean, image_scale },
115- " else_body" ),
116- result_temperal_images};
102+ return {
103+ std::make_shared<ov::Model>( ov::ResultVector{result_temperal_images, result_tmp},
104+ ov::ParameterVector{same_image, raw_images_1, raw_images_2, resize_target_shape},
105+ " else_body" ),
106+ result_temperal_images};
117107}
118108
119- std::shared_ptr<ov::Model> patch_preprocess_into_model (std::shared_ptr<ov::Model> model_org) {
120- std::cout << " patch_preprocess_into_model" << std::endl;
109+ std::shared_ptr<ov::Model> patch_preprocess_into_model (std::shared_ptr<ov::Model> model_org, const ProcessorConfig& config) {
110+ std::vector<float > a_image_mean (config.image_mean .begin (), config.image_mean .end ());
111+ std::vector<float > a_image_std (config.image_std .begin (), config.image_std .end ());
112+ for (auto & v : a_image_mean) v *= 255 .0f ;
113+ for (auto & v : a_image_std) v = 1 .0f / (v * 255 .0f );
114+ auto image_mean = std::make_shared<ov::op::v0::Constant>(ov::element::f32 , ov::Shape{1 , config.image_mean .size (), 1 , 1 }, a_image_mean.data ());
115+ auto image_std = std::make_shared<ov::op::v0::Constant>(ov::element::f32 , ov::Shape{1 , config.image_mean .size (), 1 , 1 }, a_image_std.data ());
116+
121117 auto same_image = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::Shape{1 });
122118 same_image->set_friendly_name (" same_image" );
123119 same_image->output (0 ).get_tensor ().set_names ({" same_image" });
124120
125121 auto raw_images_1 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
126122 auto raw_images_2 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
127123 auto resize_target_shape = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{2 });
128- auto image_mean = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
129- auto image_scale = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
130124
131125 auto broadcast_shape = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{4 });
132126 auto temp_shape8d = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{8 });
@@ -141,39 +135,28 @@ std::shared_ptr<ov::Model> patch_preprocess_into_model(std::shared_ptr<ov::Model
141135 resize_target_shape->set_friendly_name (" resize_target_shape" );
142136 resize_target_shape->output (0 ).get_tensor ().set_names ({" resize_target_shape" });
143137
144- image_mean->set_friendly_name (" image_mean" );
145- image_mean->output (0 ).get_tensor ().set_names ({" image_mean" });
146-
147- image_scale->set_friendly_name (" image_scale" );
148- image_scale->output (0 ).get_tensor ().set_names ({" image_scale" });
149-
150138 broadcast_shape->set_friendly_name (" broadcast_shape" );
151139 broadcast_shape->output (0 ).get_tensor ().set_names ({" broadcast_shape" });
152140
153141 auto then_raw_images_1 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
154- auto then_raw_images_2 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
155142 auto then_resize_target_shape = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{2 });
156- auto then_image_mean = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
157- auto then_image_scale = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
158143 auto then_broadcast_shape = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{4 });
159144 auto model_then = patch_preprocess_branch_image (then_raw_images_1,
160145 then_resize_target_shape,
161- then_image_mean ,
162- then_image_scale ,
146+ image_mean ,
147+ image_std ,
163148 then_broadcast_shape);
164149
165150 auto else_same_image = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::Shape{1 });
166151 auto else_raw_images_1 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
167152 auto else_raw_images_2 = std::make_shared<ov::op::v0::Parameter>(ov::element::u8 , ov::PartialShape{-1 , -1 , -1 , -1 });
168153 auto else_resize_target_shape = std::make_shared<ov::op::v0::Parameter>(ov::element::i64 , ov::PartialShape{2 });
169- auto else_image_mean = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
170- auto else_image_scale = std::make_shared<ov::op::v0::Parameter>(ov::element::f32 , ov::PartialShape{1 , -1 , 1 , 1 });
171154 auto model_else = patch_preprocess_branch_video (else_same_image,
172155 else_raw_images_1,
173156 else_raw_images_2,
174157 else_resize_target_shape,
175- else_image_mean ,
176- else_image_scale );
158+ image_mean ,
159+ image_std );
177160
178161 auto if_op = std::make_shared<ov::op::v8::If>();
179162 if_op->set_then_body (model_then.first );
@@ -183,13 +166,9 @@ std::shared_ptr<ov::Model> patch_preprocess_into_model(std::shared_ptr<ov::Model
183166 if_op->set_input (raw_images_1->output (0 ), nullptr , else_raw_images_1);
184167 if_op->set_input (raw_images_2->output (0 ), nullptr , else_raw_images_2);
185168 if_op->set_input (resize_target_shape->output (0 ), nullptr , else_resize_target_shape);
186- if_op->set_input (image_mean->output (0 ), nullptr , else_image_mean);
187- if_op->set_input (image_scale->output (0 ), nullptr , else_image_scale);
188169
189170 if_op->set_input (raw_images_1->output (0 ), then_raw_images_1, nullptr );
190171 if_op->set_input (resize_target_shape->output (0 ), then_resize_target_shape, nullptr );
191- if_op->set_input (image_mean->output (0 ), then_image_mean, nullptr );
192- if_op->set_input (image_scale->output (0 ), then_image_scale, nullptr );
193172 if_op->set_input (broadcast_shape->output (0 ), then_broadcast_shape, nullptr );
194173
195174 auto temporal_images = if_op->set_output (model_then.second , model_else.second );
@@ -223,8 +202,6 @@ std::shared_ptr<ov::Model> patch_preprocess_into_model(std::shared_ptr<ov::Model
223202 raw_images_1,
224203 raw_images_2,
225204 resize_target_shape,
226- image_mean,
227- image_scale,
228205 broadcast_shape,
229206 temp_shape8d,
230207 temp_shape4d,
@@ -539,8 +516,9 @@ std::unique_ptr<CircularBufferQueue<ov::InferRequest>> VisionEncoderQwen2VL::cre
539516}
540517
// Constructs the Qwen2-VL vision encoder from a model directory.
//
// Steps visible in this hunk:
//   1. The VisionEncoder base is initialized with (model_dir, device, properties).
//   2. A ProcessorConfig is produced by utils::from_any_map({}, m_processor_config).
//   3. The vision-embeddings IR is read from model_dir.
//   4. patch_preprocess_into_model() wraps the original model; on the '+' side of
//      this change it also receives the ProcessorConfig.
//   5. The patched model is compiled for `device` with `properties`, and the
//      result of create_ireq() is stored in m_ireq_queue_vision_encoder.
//
// NOTE(review): this text is a diff view; lines marked '-' / '+' appear to be the
// removed / added sides of the change, so only one of the two
// patch_preprocess_into_model calls exists in either version of the file.
541518VisionEncoderQwen2VL::VisionEncoderQwen2VL (const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap properties) : VisionEncoder(model_dir, device, properties) {
// NOTE(review): an empty map is passed to from_any_map, so `config` is presumably
// just a copy of m_processor_config with no overrides applied -- confirm against
// utils::from_any_map. Also confirm m_processor_config is already populated by
// the base-class constructor at this point.
519+ ProcessorConfig config = utils::from_any_map ({}, m_processor_config);
542520 auto model_org = utils::singleton_core ().read_model (model_dir / " openvino_vision_embeddings_model.xml" );
543- auto model = patch_preprocess_into_model (model_org);
521+ auto model = patch_preprocess_into_model (model_org, config );
544522 auto compiled_model = utils::singleton_core ().compile_model (model, device, properties);
545523 m_ireq_queue_vision_encoder = create_ireq (compiled_model);
546524}
@@ -552,8 +530,9 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map,
552530 const auto & vision_encoder_model = utils::get_model_weights_pair (models_map, " vision_embeddings" ).first ;
553531 const auto & vision_encoder_weights = utils::get_model_weights_pair (models_map, " vision_embeddings" ).second ;
554532
533+ ProcessorConfig config = utils::from_any_map ({}, m_processor_config);
555534 auto model_org = utils::singleton_core ().read_model (vision_encoder_model, vision_encoder_weights);
556- auto model = patch_preprocess_into_model (model_org);
535+ auto model = patch_preprocess_into_model (model_org, config );
557536
558537 auto compiled_model = utils::singleton_core ().compile_model (model, device, device_config);
559538 m_ireq_queue_vision_encoder = create_ireq (compiled_model);
@@ -644,23 +623,13 @@ std::vector<EncodedImage> VisionEncoderQwen2VL::encode_video(const std::vector<o
644623 ov::Tensor temp_shape8d (ov::element::i64 , ov::Shape{8 }, a_temp_shape8d);
645624 ov::Tensor temp_shape4d (ov::element::i64 , ov::Shape{4 }, a_temp_shape4d);
646625 ov::Tensor last_shape (ov::element::i64 , ov::Shape{2 }, last_output_shape);
647-
648- std::vector<float > a_image_mean (config.image_mean .begin (), config.image_mean .end ());
649- std::vector<float > a_image_scale (config.image_std .begin (), config.image_std .end ());
650- for (auto & v : a_image_mean) v *= 255 .0f ;
651- for (auto & v : a_image_scale) v = 1 .0f / (v*255 .0f );
652-
653- ov::Tensor image_mean (ov::element::f32 , ov::Shape{1 ,a_image_mean.size (),1 ,1 }, a_image_mean.data ());
654- ov::Tensor image_scale (ov::element::f32 , ov::Shape{1 ,a_image_scale.size (),1 ,1 }, a_image_scale.data ());
655-
626+
656627 ov::Tensor same_image (ov::element::f32 , ov::Shape{1 }, std::vector<float >{0 }.data ());
657628
658629 encoder.set_tensor (" same_image" , same_image);
659630 encoder.set_tensor (" raw_images_1" , raw_image_1);
660631 encoder.set_tensor (" raw_images_2" , raw_image_2);
661632 encoder.set_tensor (" resize_target_shape" , target_shape);
662- encoder.set_tensor (" image_mean" , image_mean);
663- encoder.set_tensor (" image_scale" , image_scale);
664633 encoder.set_tensor (" broadcast_shape" , broadcast_shape);
665634 encoder.set_tensor (" temp_shape8d" , temp_shape8d);
666635 encoder.set_tensor (" temp_shape4d" , temp_shape4d);
@@ -722,22 +691,12 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any
722691 ov::Tensor temp_shape8d (ov::element::i64 , ov::Shape{8 }, a_temp_shape8d);
723692 ov::Tensor temp_shape4d (ov::element::i64 , ov::Shape{4 }, a_temp_shape4d);
724693 ov::Tensor last_shape (ov::element::i64 , ov::Shape{2 }, last_output_shape);
725-
726- std::vector<float > a_image_mean (config.image_mean .begin (), config.image_mean .end ());
727- std::vector<float > a_image_scale (config.image_std .begin (), config.image_std .end ());
728- for (auto & v : a_image_mean) v *= 255 .0f ;
729- for (auto & v : a_image_scale) v = 1 .0f / (v*255 .0f );
730-
731- ov::Tensor image_mean (ov::element::f32 , ov::Shape{1 ,a_image_mean.size (),1 ,1 }, a_image_mean.data ());
732- ov::Tensor image_scale (ov::element::f32 , ov::Shape{1 ,a_image_scale.size (),1 ,1 }, a_image_scale.data ());
733-
734694 ov::Tensor same_image (ov::element::f32 , ov::Shape{1 }, std::vector<float >{0 }.data ());
695+
735696 encoder.set_tensor (" same_image" , same_image);
736697 encoder.set_tensor (" raw_images_1" , raw_images);
737698 encoder.set_tensor (" raw_images_2" , raw_images);
738699 encoder.set_tensor (" resize_target_shape" , target_shape);
739- encoder.set_tensor (" image_mean" , image_mean);
740- encoder.set_tensor (" image_scale" , image_scale);
741700 encoder.set_tensor (" broadcast_shape" , broadcast_shape);
742701 encoder.set_tensor (" temp_shape8d" , temp_shape8d);
743702 encoder.set_tensor (" temp_shape4d" , temp_shape4d);
0 commit comments