44#include " md_img_preprocess.hpp"
55
66#include " module_genai/module_factory.hpp"
7+ #include " module_genai/utils/tensor_utils.hpp"
78
89#include < chrono>
910#include < thread>
@@ -34,14 +35,24 @@ void ImagePreprocessModule::print_static_config() {
3435 type: "VecOVTensor" # Support DataType: [VecOVTensor]
3536 source: "ParentModuleName.OutputPortName"
3637 outputs:
37- - name: "raw_data" # Output port name
38+ - name: "raw_data" # Output port name, used by Qwen 2.5-VL
3839 type: "OVTensor" # Support DataType: [OVTensor]
39- - name: "source_size" # Output port name
40+ - name: "source_size" # Output port name, used by Qwen 2.5-VL
4041 type: "VecInt" # Support DataType: [VecInt]
41- - name: "raw_datas" # batch processed vision output
42+ - name: "raw_datas" # batch processed vision output, used by Qwen 2.5-VL
4243 type: "VecOVTensor" # Support DataType: [VecOVTensor]
43- - name: "source_sizes" # Output port name
44+ - name: "source_sizes" # Output port name, used by Qwen 2.5-VL
4445 type: "VecVecInt" # Support DataType: [VecVecInt]
46+ - name: "pixel_values" # Output port name, used by Qwen 3.5
47+ type: "OVTensor" # Support DataType: [OVTensor]
48+ - name: "grid_thw" # Output port name, used by Qwen 3.5
49+ type: "OVTensor" # Support DataType: [OVTensor]
50+ - name: "pos_embeds" # Output port name, used by Qwen 3.5
51+ type: "OVTensor" # Support DataType: [OVTensor]
52+ - name: "rotary_cos" # Output port name, used by Qwen 3.5
53+ type: "OVTensor" # Support DataType: [OVTensor]
54+ - name: "rotary_sin" # Output port name, used by Qwen 3.5
55+ type: "OVTensor" # Support DataType: [OVTensor]
4556 params:
4657 target_resolution: [224, 224] # optional
4758 mean: [0.485, 0.456, 0.406] # optional
@@ -62,6 +73,8 @@ ImagePreprocessModule::ImagePreprocessModule(const IBaseModuleDesc::PTR& desc, c
6273
6374 if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
6475 encoder_ptr = std::make_shared<VisionEncoderQwen2VL>(std::filesystem::path (model_path), device, ov::AnyMap{});
76+ } else if (model_type == VLMModelType::QWEN3_5) {
77+ encoder_ptr = std::make_shared<Qwen3_5Preprocessor>(std::filesystem::path (model_path));
6578 } else {
6679 GENAI_ERR (" ImagePreprocessModule[" + desc->name + " ]: Unsupported model type: " + desc->model_type );
6780 }
@@ -72,28 +85,48 @@ ImagePreprocessModule::~ImagePreprocessModule() {}
7285void ImagePreprocessModule::run () {
7386 GENAI_INFO (" Running module: " + module_desc->name );
7487 prepare_inputs ();
88+ VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
7589
7690 if (exists_input (" images" )) {
7791 auto images_data = get_input (" images" ).as <std::vector<ov::Tensor>>();
78- std::vector<ov::Tensor> output_tensors;
79- std::vector<ImageSize> output_sizes;
80- for (size_t i = 0 ; i < images_data.size (); ++i) {
81- auto encoded_img = encoder_ptr->encode (images_data[i], ov::AnyMap{});
82- output_tensors.push_back (encoded_img.resized_source );
83- output_sizes.push_back (encoded_img.resized_source_size );
84- }
85- this ->outputs [" raw_datas" ].data = output_tensors;
86- std::vector<std::vector<int >> sizes_vec;
87- for (const auto & sz : output_sizes) {
88- sizes_vec.push_back ({static_cast <int >(sz.height ), static_cast <int >(sz.width )});
92+ if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
93+ std::vector<ov::Tensor> output_tensors;
94+ std::vector<ImageSize> output_sizes;
95+ for (size_t i = 0 ; i < images_data.size (); ++i) {
96+ auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode (images_data[i], ov::AnyMap{});
97+ output_tensors.push_back (encoded_img.resized_source );
98+ output_sizes.push_back (encoded_img.resized_source_size );
99+ }
100+ this ->outputs [" raw_datas" ].data = output_tensors;
101+ std::vector<std::vector<int >> sizes_vec;
102+ for (const auto & sz : output_sizes) {
103+ sizes_vec.push_back ({static_cast <int >(sz.height ), static_cast <int >(sz.width )});
104+ }
105+ this ->outputs [" source_sizes" ].data = sizes_vec;
106+ } else if (model_type == VLMModelType::QWEN3_5) {
107+ ov::Tensor images = tensor_utils::stack (images_data, 0 );
108+ Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess (images);
109+ this ->outputs [" pixel_values" ].data = output.pixel_values ;
110+ this ->outputs [" grid_thw" ].data = output.grid_thw ;
111+ this ->outputs [" pos_embeds" ].data = output.pos_embeds ;
112+ this ->outputs [" rotary_cos" ].data = output.rotary_cos ;
113+ this ->outputs [" rotary_sin" ].data = output.rotary_sin ;
89114 }
90- this ->outputs [" source_sizes" ].data = sizes_vec;
91115 } else {
92116 auto image1_data = get_input (" image" ).as <ov::Tensor>();
93- auto encoded_img = encoder_ptr->encode (image1_data, ov::AnyMap{});
94- this ->outputs [" raw_data" ].data = encoded_img.resized_source ;
95- this ->outputs [" source_size" ].data =
96- std::vector<int >{static_cast <int >(encoded_img.resized_source_size .height ), static_cast <int >(encoded_img.resized_source_size .width )};
117+ if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
118+ auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode (image1_data, ov::AnyMap{});
119+ this ->outputs [" raw_data" ].data = encoded_img.resized_source ;
120+ this ->outputs [" source_size" ].data =
121+ std::vector<int >{static_cast <int >(encoded_img.resized_source_size .height ), static_cast <int >(encoded_img.resized_source_size .width )};
122+ } else if (model_type == VLMModelType::QWEN3_5) {
123+ Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess (image1_data);
124+ this ->outputs [" pixel_values" ].data = output.pixel_values ;
125+ this ->outputs [" grid_thw" ].data = output.grid_thw ;
126+ this ->outputs [" pos_embeds" ].data = output.pos_embeds ;
127+ this ->outputs [" rotary_cos" ].data = output.rotary_cos ;
128+ this ->outputs [" rotary_sin" ].data = output.rotary_sin ;
129+ }
97130 }
98131}
99132
0 commit comments