Skip to content

Commit 2d03373

Browse files
committed
Enable text encoder for Qwen 3.5
Enable text encoder for Qwen 3.5. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent add8ce5 commit 2d03373

File tree

10 files changed

+280
-53
lines changed

10 files changed

+280
-53
lines changed

src/cpp/src/module_genai/modules/md_img_preprocess.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include "module_genai/module_factory.hpp"
77
#include "module_genai/utils/tensor_utils.hpp"
8+
#include "model/qwen3_5/qwen3_5preprocessor.hpp"
89

910
#include <chrono>
1011
#include <thread>
@@ -105,7 +106,7 @@ void ImagePreprocessModule::run() {
105106
this->outputs["source_sizes"].data = sizes_vec;
106107
} else if (model_type == VLMModelType::QWEN3_5) {
107108
ov::Tensor images = tensor_utils::stack(images_data, 0);
108-
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
109+
auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(images));
109110
this->outputs["pixel_values"].data = output.pixel_values;
110111
this->outputs["grid_thw"].data = output.grid_thw;
111112
this->outputs["pos_embeds"].data = output.pos_embeds;
@@ -120,7 +121,7 @@ void ImagePreprocessModule::run() {
120121
this->outputs["source_size"].data =
121122
std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
122123
} else if (model_type == VLMModelType::QWEN3_5) {
123-
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
124+
auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(image1_data));
124125
this->outputs["pixel_values"].data = output.pixel_values;
125126
this->outputs["grid_thw"].data = output.grid_thw;
126127
this->outputs["pos_embeds"].data = output.pos_embeds;

src/cpp/src/module_genai/modules/md_img_preprocess.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#include "module_genai/module.hpp"
1010
#include "module_genai/module_type.hpp"
11-
#include "model/qwen3_5/qwen3_5preprocessor.hpp"
11+
#include "preprocessor.hpp"
1212
#include "visual_language/qwen2vl/classes.hpp"
1313

1414
namespace ov {
@@ -18,7 +18,7 @@ class ImagePreprocessModule : public IBaseModule {
1818
DeclareModuleConstructor(ImagePreprocessModule);
1919

2020
private:
21-
std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
21+
std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Preprocessor>> encoder_ptr;
2222
};
2323

2424
REGISTER_MODULE_CONFIG(ImagePreprocessModule);

src/cpp/src/module_genai/modules/md_text_encoder.cpp

Lines changed: 111 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include "module_genai/module_factory.hpp"
77
#include "openvino/genai/tokenizer.hpp"
88
#include "tokenizer/tokenizer_impl.hpp"
9+
#include "model/qwen3_5/qwen3_5config.hpp"
910

1011
#include <chrono>
1112
#include <thread>
@@ -32,24 +33,27 @@ void TextEncoderModule::print_static_config() {
3233
- name: "prompts"
3334
type: "VecString" # [Optional] Support DataType: [VecString]
3435
source: "ParentModuleName.OutputPortName"
35-
- name: "encoded_image"
36+
- name: "encoded_image" # Used by Qwen 2.5-VL
3637
type: "OVTensor" # [Optional] Support DataType: [OVTensor]
3738
source: "ParentModuleName.OutputPortName"
38-
- name: "encoded_images"
39+
- name: "encoded_images" # Used by Qwen 2.5-VL
3940
type: "VecOVTensor" # [Optional] Support DataType: [VecOVTensor]
4041
source: "ParentModuleName.OutputPortName"
41-
- name: "source_size"
42+
- name: "source_size" # Used by Qwen 2.5-VL
4243
type: "VecInt" # [Optional] Support DataType: [VecInt]
4344
source: "ParentModuleName.OutputPortName"
44-
- name: "source_sizes"
45+
- name: "source_sizes" # Used by Qwen 2.5-VL
4546
type: "VecVecInt" # [Optional] Support DataType: [VecVecInt]
4647
source: "ParentModuleName.OutputPortName"
48+
- name: "grid_thw" # Used by Qwen 3.5
49+
type: "OVTensor" # [Optional] Support DataType: [OVTensor]
50+
source: "ParentModuleName.OutputPortName"
4751
outputs:
4852
- name: "input_ids"
4953
type: "OVTensor" # Support DataType: [OVTensor, OVRemoteTensor]
5054
- name: "mask"
5155
type: "OVTensor" # Support DataType: [OVTensor, OVRemoteTensor]
52-
- name: "images_sequence"
56+
- name: "images_sequence" # Output by Qwen 2.5-VL
5357
type: "VecInt" # Support DataType: [VecInt]
5458
params:
5559
model_path: "models/text_encoder.xml" # Optional. OpenVINO IR
@@ -77,14 +81,25 @@ bool TextEncoderModule::initialize() {
7781

7882
m_tokenizer_impl = std::make_shared<Tokenizer::TokenizerImpl>(tokenizer_path, m_tokenization_params);
7983
OPENVINO_ASSERT(m_tokenizer_impl->m_ireq_queue_tokenizer != nullptr, std::string("Load tokenizer model fail: ") + tokenizer_path.string());
84+
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
8085
m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(tokenizer_path, "config.json");
8186
m_processor_config = utils::from_config_json_if_exists<ProcessorConfig>(tokenizer_path, "preprocessor_config.json");
82-
m_merge_length = std::pow(m_processor_config.merge_size, 2);
87+
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
88+
m_merge_length = std::pow(m_processor_config.merge_size, 2);
89+
} else if (model_type == VLMModelType::QWEN3_5) {
90+
Qwen3_5VisionConfig vision_config = Qwen3_5VisionConfig::from_json_file(tokenizer_path / "config.json");
91+
m_merge_length = std::pow(vision_config.spatial_merge_size, 2);
92+
} else {
93+
GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
94+
}
95+
8396
return true;
8497
}
8598

8699
void TextEncoderModule::run() {
87100
GENAI_INFO("Running module: " + module_desc->name);
101+
102+
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
88103

89104
prepare_inputs();
90105
std::vector<std::string> m_prompts = {};
@@ -97,39 +112,52 @@ void TextEncoderModule::run() {
97112
OPENVINO_ASSERT(false, "TextEncoderModule[" + module_desc->name + "]: No prompt input found.");
98113
}
99114

100-
std::vector<ov::Tensor> encoded_images = {};
101-
std::vector<std::vector<int>> source_sizes = {};
102-
bool has_encoded_image = false;
103-
if (exists_input("encoded_image")) {
104-
ov::Tensor encoded_image = get_input("encoded_image").as<ov::Tensor>();
105-
encoded_images.push_back(encoded_image);
106-
has_encoded_image = true;
107-
}
108-
if (exists_input("encoded_images")) {
109-
encoded_images = get_input("encoded_images").as<std::vector<ov::Tensor>>();
110-
has_encoded_image = true;
111-
}
112-
if (exists_input("source_size")) {
113-
source_sizes.push_back(get_input("source_size").as<std::vector<int>>());
114-
}
115-
if (exists_input("source_sizes")) {
116-
source_sizes = get_input("source_sizes").as<std::vector<std::vector<int>>>();
117-
}
115+
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
116+
std::vector<ov::Tensor> encoded_images = {};
117+
std::vector<std::vector<int>> source_sizes = {};
118+
bool has_encoded_image = false;
119+
if (exists_input("encoded_image")) {
120+
ov::Tensor encoded_image = get_input("encoded_image").as<ov::Tensor>();
121+
encoded_images.push_back(encoded_image);
122+
has_encoded_image = true;
123+
}
124+
if (exists_input("encoded_images")) {
125+
encoded_images = get_input("encoded_images").as<std::vector<ov::Tensor>>();
126+
has_encoded_image = true;
127+
}
128+
if (exists_input("source_size")) {
129+
source_sizes.push_back(get_input("source_size").as<std::vector<int>>());
130+
}
131+
if (exists_input("source_sizes")) {
132+
source_sizes = get_input("source_sizes").as<std::vector<std::vector<int>>>();
133+
}
118134

119-
if (has_encoded_image) {
120-
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
121-
if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
122-
GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
123-
return;
135+
if (has_encoded_image) {
136+
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
137+
if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
138+
GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
139+
return;
140+
}
124141
}
125-
}
126142

127-
auto [encoded, images_sequence] = run(m_prompts, encoded_images, source_sizes, has_encoded_image);
143+
auto [encoded, images_sequence] = run(m_prompts, encoded_images, source_sizes, has_encoded_image);
128144

129-
this->outputs["input_ids"].data = encoded.input_ids;
130-
this->outputs["mask"].data = encoded.attention_mask;
131-
if (images_sequence.size() > 0) {
132-
this->outputs["images_sequence"].data = images_sequence;
145+
this->outputs["input_ids"].data = encoded.input_ids;
146+
this->outputs["mask"].data = encoded.attention_mask;
147+
if (images_sequence.size() > 0) {
148+
this->outputs["images_sequence"].data = images_sequence;
149+
}
150+
} else if (model_type == VLMModelType::QWEN3_5) {
151+
std::optional<ov::Tensor> grid_thw = std::nullopt;
152+
if (exists_input("grid_thw")) {
153+
grid_thw = get_input("grid_thw").as<ov::Tensor>();
154+
}
155+
156+
auto encoded = run(m_prompts, grid_thw);
157+
this->outputs["input_ids"].data = encoded.input_ids;
158+
this->outputs["mask"].data = encoded.attention_mask;
159+
} else {
160+
OPENVINO_THROW("Unsupported model type: " + module_desc->model_type);
133161
}
134162
}
135163

@@ -160,24 +188,45 @@ std::pair<TokenizedInputs, std::vector<int>> TextEncoderModule::run(const std::v
160188
}
161189
}
162190

191+
/// Tokenize prompts for Qwen 3.5.
///
/// When `grid_thw` holds a tensor, each prompt is rewritten by
/// normalize_prompt() (base image id and base video id are both fixed to 0),
/// wrapped in the user/assistant chat markers and then tokenized.
/// When `grid_thw` is empty the prompts are tokenized exactly as given,
/// without any chat markers.
///
/// @param prompts   Raw prompt strings, one entry per request.
/// @param grid_thw  Optional grid tensor forwarded to normalize_prompt().
/// @return Result of the tokenizer's encode() call for the whole batch.
TokenizedInputs TextEncoderModule::run(const std::vector<std::string>& prompts, std::optional<ov::Tensor>& grid_thw) {
    if (!grid_thw.has_value()) {
        // No grid information: encode the prompts untouched.
        return m_tokenizer_impl->encode(prompts, m_tokenization_params);
    }

    const ov::Tensor& thw = *grid_thw;
    std::vector<std::string> wrapped_prompts;
    wrapped_prompts.reserve(prompts.size());
    for (const std::string& raw_prompt : prompts) {
        // Base image/video ids are hard-coded to 0. Only the rewritten prompt
        // text is used here; the returned id sequences are not needed.
        auto [expanded, image_ids, video_ids] = normalize_prompt(raw_prompt, 0, 0, thw);
        wrapped_prompts.push_back("<|im_start|>user\n" + expanded + "<|im_end|>\n<|im_start|>assistant\n");
    }
    return m_tokenizer_impl->encode(wrapped_prompts, m_tokenization_params);
}
208+
163209
NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
164210
size_t base_image_id,
165211
size_t base_video_id,
166212
const std::vector<ov::Tensor>& encoded_images,
167213
const std::vector<ov::Tensor>& encoded_videos,
168214
const std::vector<std::vector<int>>& source_sizes) {
169-
auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, encoded_images.size());
170-
std::vector<std::array<size_t, 3>> images_grid_thw;
171-
images_grid_thw.reserve(encoded_images.size());
172-
for (const auto& source_size : source_sizes) {
173-
size_t grid_t = 1;
174-
size_t grid_h = source_size[0];
175-
size_t grid_w = source_size[1];
176-
images_grid_thw.push_back({grid_t, grid_h, grid_w});
177-
}
215+
auto thw = calc_thw(source_sizes);
216+
return normalize_prompt(prompt, base_image_id, base_video_id, thw);
217+
}
178218

219+
NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
220+
size_t base_image_id,
221+
size_t base_video_id,
222+
const ov::Tensor& grid_thw) {
223+
const ov::Shape& thw_shape = grid_thw.get_shape();
224+
auto thw_data = grid_thw.data<const int64_t>();
225+
auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, thw_shape[0]);
179226
for (size_t new_image_id : images_sequence) {
180-
auto [grid_t, grid_h, grid_w] = images_grid_thw.at(new_image_id - base_image_id);
227+
size_t grid_t = thw_data[(new_image_id - base_image_id) * 3 + 0];
228+
size_t grid_h = thw_data[(new_image_id - base_image_id) * 3 + 1];
229+
size_t grid_w = thw_data[(new_image_id - base_image_id) * 3 + 2];
181230
const size_t num_image_pad_tokens = calc_tokens_num(grid_t, grid_h, grid_w);
182231

183232
std::string expanded_tag;
@@ -189,6 +238,9 @@ NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
189238
expanded_tag.append(m_vlm_config.image_pad_token);
190239
}
191240
expanded_tag.append(m_vlm_config.vision_end_token);
241+
if (to_vlm_model_type(module_desc->model_type) == VLMModelType::QWEN3_5) {
242+
expanded_tag.append("\n");
243+
}
192244

193245
unified_prompt.replace(unified_prompt.find(NATIVE_TAG), NATIVE_TAG.length(), expanded_tag);
194246
}
@@ -233,6 +285,20 @@ size_t TextEncoderModule::calc_tokens_num(size_t grid_t, size_t grid_h, size_t g
233285
return grid_t * grid_h * grid_w / m_merge_length;
234286
}
235287

288+
/// Pack per-image source sizes into an i64 tensor of shape
/// [source_sizes.size(), 3].
///
/// Row i is laid out as {1, source_sizes[i][0], source_sizes[i][1]}: the
/// first column (grid_t) is always 1, the remaining two columns are copied
/// from the source size entry as grid_h and grid_w.
///
/// @param source_sizes  One entry per image; each entry must hold at least
///                      two values (any further values are ignored).
/// @return Newly allocated tensor holding the rows described above.
/// @throws ov::Exception (via OPENVINO_ASSERT) when an entry has fewer than
///         two values, instead of reading past the end of that entry.
ov::Tensor TextEncoderModule::calc_thw(const std::vector<std::vector<int>>& source_sizes) {
    ov::Tensor thw_tensor(ov::element::i64, ov::Shape{source_sizes.size(), 3});
    auto thw_data = thw_tensor.data<int64_t>();
    for (size_t i = 0; i < source_sizes.size(); ++i) {
        const std::vector<int>& source_size = source_sizes[i];
        // The sizes arrive from another module's output port; reject malformed
        // entries up front rather than indexing out of bounds.
        OPENVINO_ASSERT(source_size.size() >= 2,
                        "TextEncoderModule[",
                        module_desc->name,
                        "]: source size #",
                        i,
                        " must contain at least 2 values, got ",
                        source_size.size());

        const int64_t grid_t = 1;
        thw_data[i * 3 + 0] = grid_t;
        thw_data[i * 3 + 1] = static_cast<int64_t>(source_size[0]);
        thw_data[i * 3 + 2] = static_cast<int64_t>(source_size[1]);
    }
    return thw_tensor;
}
301+
236302
} // namespace module
237303
} // namespace genai
238304
} // namespace ov

src/cpp/src/module_genai/modules/md_text_encoder.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,19 +30,26 @@ class TextEncoderModule : public IBaseModule {
3030
const std::vector<ov::Tensor>& encoded_images,
3131
const std::vector<std::vector<int>>& source_sizes,
3232
bool has_encoded_image = false);
33+
TokenizedInputs run(const std::vector<std::string>& prompts, std::optional<ov::Tensor>& grid_thw);
3334
NormalizedPrompt normalize_prompt(const std::string& prompt,
3435
size_t base_image_id,
3536
size_t base_video_id,
3637
const std::vector<ov::Tensor>& encoded_images,
3738
const std::vector<ov::Tensor>& encoded_videos,
3839
const std::vector<std::vector<int>>& source_sizes);
40+
NormalizedPrompt normalize_prompt(const std::string& prompt,
41+
size_t base_image_id,
42+
size_t base_video_id,
43+
const ov::Tensor& grid_thw);
3944
std::pair<std::string, std::vector<size_t>> normalize(
4045
const std::string& prompt,
4146
const std::string& native_tag,
4247
const std::string& automatic_tag,
4348
size_t base_id,
4449
size_t n_images);
4550
size_t calc_tokens_num(size_t grid_t, size_t grid_h, size_t grid_w) const;
51+
52+
ov::Tensor calc_thw(const std::vector<std::vector<int>>& source_sizes);
4653
};
4754

4855
REGISTER_MODULE_CONFIG(TextEncoderModule);

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
2626
load_pos_embed_weight(model_path);
2727
}
2828

29-
Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
29+
std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
3030
const auto img_shape = images.get_shape();
3131
if (img_shape.size() != 3 && img_shape.size() != 4) {
3232
OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");
@@ -173,7 +173,7 @@ Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &imag
173173
auto pos_embeds = build_pos_embeds(grid_thw);
174174
auto rotary = build_rotary_cos_sin(grid_thw);
175175

176-
return {pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};
176+
return Qwen3_5PreprocessorOutput{pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};
177177
}
178178

179179
void Qwen3_5Preprocessor::load_pos_embed_weight(const std::filesystem::path &model_path) {

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include <string>
1010
#include "openvino/runtime/tensor.hpp"
1111
#include "qwen3_5config.hpp"
12+
#include "../../preprocessor.hpp"
1213

1314
namespace ov::genai::module {
1415

@@ -20,11 +21,11 @@ struct Qwen3_5PreprocessorOutput {
2021
ov::Tensor rotary_sin;
2122
};
2223

23-
class Qwen3_5Preprocessor {
24+
class Qwen3_5Preprocessor : public Preprocessor {
2425
public:
2526
explicit Qwen3_5Preprocessor(const std::filesystem::path& model_path);
2627

27-
Qwen3_5PreprocessorOutput preprocess(const ov::Tensor &images);
28+
std::any preprocess(const ov::Tensor &images) override;
2829
private:
2930
Qwen3_5VisionPreprocessConfig m_preprocess_config;
3031
Qwen3_5VisionConfig m_vision_config;
src/cpp/src/module_genai/modules/preprocessor.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Copyright (C) 2023-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#pragma once
5+
#include <any>
6+
#include <openvino/runtime/tensor.hpp>
7+
8+
namespace ov::genai::module {
9+
10+
class Preprocessor {
11+
public:
12+
Preprocessor() = default;
13+
virtual ~Preprocessor() = default;
14+
15+
virtual std::any preprocess(const ov::Tensor &images) = 0;
16+
};
17+
18+
}

src/cpp/src/visual_language/vlm_config.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ VLMModelType to_vlm_model_type(const std::string& value) {
2323
{"qwen2_5_vl", VLMModelType::QWEN2_5_VL},
2424
{"gemma3", VLMModelType::GEMMA3},
2525
{"qwen3_5", VLMModelType::QWEN3_5},
26+
{"qwen3_5_moe", VLMModelType::QWEN3_5_MOE},
2627
};
2728

2829
auto it = model_types_map.find(value);

src/cpp/src/visual_language/vlm_config.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ enum class VLMModelType {
2222
QWEN2_5_VL,
2323
GEMMA3,
2424
QWEN3_5,
25+
QWEN3_5_MOE,
2526
};
2627

2728
VLMModelType to_vlm_model_type(const std::string& value);

0 commit comments

Comments
 (0)