66#include " module_genai/module_factory.hpp"
77#include " openvino/genai/tokenizer.hpp"
88#include " tokenizer/tokenizer_impl.hpp"
9+ #include " model/qwen3_5/qwen3_5config.hpp"
910
1011#include < chrono>
1112#include < thread>
@@ -32,24 +33,27 @@ void TextEncoderModule::print_static_config() {
3233 - name: "prompts"
3334 type: "VecString" # [Optional] Support DataType: [VecString]
3435 source: "ParentModuleName.OutputPortName"
35- - name: "encoded_image"
36+ - name: "encoded_image" # Used by Qwen 2.5-VL
3637 type: "OVTensor" # [Optional] Support DataType: [OVTensor]
3738 source: "ParentModuleName.OutputPortName"
38- - name: "encoded_images"
39+ - name: "encoded_images" # Used by Qwen 2.5-VL
3940 type: "VecOVTensor" # [Optional] Support DataType: [VecOVTensor]
4041 source: "ParentModuleName.OutputPortName"
41- - name: "source_size"
42+ - name: "source_size" # Used by Qwen 2.5-VL
4243 type: "VecInt" # [Optional] Support DataType: [VecInt]
4344 source: "ParentModuleName.OutputPortName"
44- - name: "source_sizes"
45+ - name: "source_sizes" # Used by Qwen 2.5-VL
4546 type: "VecVecInt" # [Optional] Support DataType: [VecVecInt]
4647 source: "ParentModuleName.OutputPortName"
48+ - name: "grid_thw" # Used by Qwen 3.5
49+ type: "OVTensor" # [Optional] Support DataType: [OVTensor]
50+ source: "ParentModuleName.OutputPortName"
4751 outputs:
4852 - name: "input_ids"
4953 type: "OVTensor" # Support DataType: [OVTensor, OVRemoteTensor]
5054 - name: "mask"
5155 type: "OVTensor" # Support DataType: [OVTensor, OVRemoteTensor]
52- - name: "images_sequence"
56+ - name: "images_sequence" # Output by Qwen 2.5-VL
5357 type: "VecInt" # Support DataType: [VecInt]
5458 params:
5559 model_path: "models/text_encoder.xml" # Optional. OpenVINO IR
@@ -77,14 +81,25 @@ bool TextEncoderModule::initialize() {
7781
7882 m_tokenizer_impl = std::make_shared<Tokenizer::TokenizerImpl>(tokenizer_path, m_tokenization_params);
7983 OPENVINO_ASSERT (m_tokenizer_impl->m_ireq_queue_tokenizer != nullptr , std::string (" Load tokenizer model fail: " ) + tokenizer_path.string ());
84+ VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
8085 m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(tokenizer_path, " config.json" );
8186 m_processor_config = utils::from_config_json_if_exists<ProcessorConfig>(tokenizer_path, " preprocessor_config.json" );
82- m_merge_length = std::pow (m_processor_config.merge_size , 2 );
87+ if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
88+ m_merge_length = std::pow (m_processor_config.merge_size , 2 );
89+ } else if (model_type == VLMModelType::QWEN3_5) {
90+ Qwen3_5VisionConfig vision_config = Qwen3_5VisionConfig::from_json_file (tokenizer_path / " config.json" );
91+ m_merge_length = std::pow (vision_config.spatial_merge_size , 2 );
92+ } else {
93+ GENAI_ERR (" TextEncoderModule[" + module_desc->name + " ]: Unsupported model type: " + module_desc->model_type );
94+ }
95+
8396 return true ;
8497}
8598
8699void TextEncoderModule::run () {
87100 GENAI_INFO (" Running module: " + module_desc->name );
101+
102+ VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
88103
89104 prepare_inputs ();
90105 std::vector<std::string> m_prompts = {};
@@ -97,39 +112,52 @@ void TextEncoderModule::run() {
97112 OPENVINO_ASSERT (false , " TextEncoderModule[" + module_desc->name + " ]: No prompt input found." );
98113 }
99114
100- std::vector<ov::Tensor> encoded_images = {};
101- std::vector<std::vector<int >> source_sizes = {};
102- bool has_encoded_image = false ;
103- if (exists_input (" encoded_image" )) {
104- ov::Tensor encoded_image = get_input (" encoded_image" ).as <ov::Tensor>();
105- encoded_images.push_back (encoded_image);
106- has_encoded_image = true ;
107- }
108- if (exists_input (" encoded_images" )) {
109- encoded_images = get_input (" encoded_images" ).as <std::vector<ov::Tensor>>();
110- has_encoded_image = true ;
111- }
112- if (exists_input (" source_size" )) {
113- source_sizes.push_back (get_input (" source_size" ).as <std::vector<int >>());
114- }
115- if (exists_input (" source_sizes" )) {
116- source_sizes = get_input (" source_sizes" ).as <std::vector<std::vector<int >>>();
117- }
115+ if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
116+ std::vector<ov::Tensor> encoded_images = {};
117+ std::vector<std::vector<int >> source_sizes = {};
118+ bool has_encoded_image = false ;
119+ if (exists_input (" encoded_image" )) {
120+ ov::Tensor encoded_image = get_input (" encoded_image" ).as <ov::Tensor>();
121+ encoded_images.push_back (encoded_image);
122+ has_encoded_image = true ;
123+ }
124+ if (exists_input (" encoded_images" )) {
125+ encoded_images = get_input (" encoded_images" ).as <std::vector<ov::Tensor>>();
126+ has_encoded_image = true ;
127+ }
128+ if (exists_input (" source_size" )) {
129+ source_sizes.push_back (get_input (" source_size" ).as <std::vector<int >>());
130+ }
131+ if (exists_input (" source_sizes" )) {
132+ source_sizes = get_input (" source_sizes" ).as <std::vector<std::vector<int >>>();
133+ }
118134
119- if (has_encoded_image) {
120- VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
121- if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
122- GENAI_ERR (" TextEncoderModule[" + module_desc->name + " ]: Unsupported model type: " + module_desc->model_type );
123- return ;
135+ if (has_encoded_image) {
136+ VLMModelType model_type = to_vlm_model_type (module_desc->model_type );
137+ if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
138+ GENAI_ERR (" TextEncoderModule[" + module_desc->name + " ]: Unsupported model type: " + module_desc->model_type );
139+ return ;
140+ }
124141 }
125- }
126142
127- auto [encoded, images_sequence] = run (m_prompts, encoded_images, source_sizes, has_encoded_image);
143+ auto [encoded, images_sequence] = run (m_prompts, encoded_images, source_sizes, has_encoded_image);
128144
129- this ->outputs [" input_ids" ].data = encoded.input_ids ;
130- this ->outputs [" mask" ].data = encoded.attention_mask ;
131- if (images_sequence.size () > 0 ) {
132- this ->outputs [" images_sequence" ].data = images_sequence;
145+ this ->outputs [" input_ids" ].data = encoded.input_ids ;
146+ this ->outputs [" mask" ].data = encoded.attention_mask ;
147+ if (images_sequence.size () > 0 ) {
148+ this ->outputs [" images_sequence" ].data = images_sequence;
149+ }
150+ } else if (model_type == VLMModelType::QWEN3_5) {
151+ std::optional<ov::Tensor> grid_thw = std::nullopt ;
152+ if (exists_input (" grid_thw" )) {
153+ grid_thw = get_input (" grid_thw" ).as <ov::Tensor>();
154+ }
155+
156+ auto encoded = run (m_prompts, grid_thw);
157+ this ->outputs [" input_ids" ].data = encoded.input_ids ;
158+ this ->outputs [" mask" ].data = encoded.attention_mask ;
159+ } else {
160+ OPENVINO_THROW (" Unsupported model type: " + module_desc->model_type );
133161 }
134162}
135163
@@ -160,24 +188,45 @@ std::pair<TokenizedInputs, std::vector<int>> TextEncoderModule::run(const std::v
160188 }
161189}
162190
191+ TokenizedInputs TextEncoderModule::run (const std::vector<std::string>& prompts, std::optional<ov::Tensor>& grid_thw) {
192+ if (grid_thw.has_value ()) {
193+ std::vector<std::string> unified_prompts = {};
194+ for (const auto &prompt : prompts) {
195+ // Hard code base image/video id and encoded images/videos
196+ auto [unified_prompt, images_sequence, videos_sequence] = normalize_prompt (prompt, 0 , 0 , grid_thw.value ());
197+ std::stringstream ss;
198+ ss << " <|im_start|>user\n " ;
199+ ss << unified_prompt;
200+ ss << " <|im_end|>\n <|im_start|>assistant\n " ;
201+ unified_prompts.push_back (ss.str ());
202+ }
203+ return m_tokenizer_impl->encode (unified_prompts, m_tokenization_params);
204+ } else {
205+ return m_tokenizer_impl->encode (prompts, m_tokenization_params);
206+ }
207+ }
208+
163209NormalizedPrompt TextEncoderModule::normalize_prompt (const std::string& prompt,
164210 size_t base_image_id,
165211 size_t base_video_id,
166212 const std::vector<ov::Tensor>& encoded_images,
167213 const std::vector<ov::Tensor>& encoded_videos,
168214 const std::vector<std::vector<int >>& source_sizes) {
169- auto [unified_prompt, images_sequence] = normalize (prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, encoded_images.size ());
170- std::vector<std::array<size_t , 3 >> images_grid_thw;
171- images_grid_thw.reserve (encoded_images.size ());
172- for (const auto & source_size : source_sizes) {
173- size_t grid_t = 1 ;
174- size_t grid_h = source_size[0 ];
175- size_t grid_w = source_size[1 ];
176- images_grid_thw.push_back ({grid_t , grid_h, grid_w});
177- }
215+ auto thw = calc_thw (source_sizes);
216+ return normalize_prompt (prompt, base_image_id, base_video_id, thw);
217+ }
178218
219+ NormalizedPrompt TextEncoderModule::normalize_prompt (const std::string& prompt,
220+ size_t base_image_id,
221+ size_t base_video_id,
222+ const ov::Tensor& grid_thw) {
223+ const ov::Shape& thw_shape = grid_thw.get_shape ();
224+ auto thw_data = grid_thw.data <const int64_t >();
225+ auto [unified_prompt, images_sequence] = normalize (prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, thw_shape[0 ]);
179226 for (size_t new_image_id : images_sequence) {
180- auto [grid_t , grid_h, grid_w] = images_grid_thw.at (new_image_id - base_image_id);
227+ size_t grid_t = thw_data[(new_image_id - base_image_id) * 3 + 0 ];
228+ size_t grid_h = thw_data[(new_image_id - base_image_id) * 3 + 1 ];
229+ size_t grid_w = thw_data[(new_image_id - base_image_id) * 3 + 2 ];
181230 const size_t num_image_pad_tokens = calc_tokens_num (grid_t , grid_h, grid_w);
182231
183232 std::string expanded_tag;
@@ -189,6 +238,9 @@ NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
189238 expanded_tag.append (m_vlm_config.image_pad_token );
190239 }
191240 expanded_tag.append (m_vlm_config.vision_end_token );
241+ if (to_vlm_model_type (module_desc->model_type ) == VLMModelType::QWEN3_5) {
242+ expanded_tag.append (" \n " );
243+ }
192244
193245 unified_prompt.replace (unified_prompt.find (NATIVE_TAG), NATIVE_TAG.length (), expanded_tag);
194246 }
@@ -233,6 +285,20 @@ size_t TextEncoderModule::calc_tokens_num(size_t grid_t, size_t grid_h, size_t g
233285 return grid_t * grid_h * grid_w / m_merge_length;
234286}
235287
288+ ov::Tensor TextEncoderModule::calc_thw (const std::vector<std::vector<int >>& source_sizes) {
289+ ov::Tensor thw_tensor (ov::element::i64 , ov::Shape{source_sizes.size (), 3 });
290+ auto thw_data = thw_tensor.data <int64_t >();
291+ for (size_t i = 0 ; i < source_sizes.size (); i++) {
292+ int64_t grid_t = 1 ;
293+ auto grid_h = static_cast <int64_t >(source_sizes[i][0 ]);
294+ auto grid_w = static_cast <int64_t >(source_sizes[i][1 ]);
295+ thw_data[i * 3 + 0 ] = grid_t ;
296+ thw_data[i * 3 + 1 ] = grid_h;
297+ thw_data[i * 3 + 2 ] = grid_w;
298+ }
299+ return thw_tensor;
300+ }
301+
236302} // namespace module
237303} // namespace genai
238304} // namespace ov
0 commit comments