Skip to content

Commit 5f6a60b

Browse files
committed
Enable vision encoder module for Qwen 3.5
Enable vision encoder module for Qwen 3.5. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent 2d03373 commit 5f6a60b

File tree

11 files changed

+587
-85
lines changed

11 files changed

+587
-85
lines changed

src/cpp/src/module_genai/modules/md_img_preprocess.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ void ImagePreprocessModule::run() {
106106
this->outputs["source_sizes"].data = sizes_vec;
107107
} else if (model_type == VLMModelType::QWEN3_5) {
108108
ov::Tensor images = tensor_utils::stack(images_data, 0);
109-
auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(images));
109+
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
110110
this->outputs["pixel_values"].data = output.pixel_values;
111111
this->outputs["grid_thw"].data = output.grid_thw;
112112
this->outputs["pos_embeds"].data = output.pos_embeds;
@@ -121,7 +121,7 @@ void ImagePreprocessModule::run() {
121121
this->outputs["source_size"].data =
122122
std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
123123
} else if (model_type == VLMModelType::QWEN3_5) {
124-
auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(image1_data));
124+
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
125125
this->outputs["pixel_values"].data = output.pixel_values;
126126
this->outputs["grid_thw"].data = output.grid_thw;
127127
this->outputs["pos_embeds"].data = output.pos_embeds;

src/cpp/src/module_genai/modules/md_img_preprocess.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#include "module_genai/module.hpp"
1010
#include "module_genai/module_type.hpp"
11-
#include "preprocessor.hpp"
11+
#include "model/qwen3_5/qwen3_5preprocessor.hpp"
1212
#include "visual_language/qwen2vl/classes.hpp"
1313

1414
namespace ov {
@@ -18,7 +18,7 @@ class ImagePreprocessModule : public IBaseModule {
1818
DeclareModuleConstructor(ImagePreprocessModule);
1919

2020
private:
21-
std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Preprocessor>> encoder_ptr;
21+
std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
2222
};
2323

2424
REGISTER_MODULE_CONFIG(ImagePreprocessModule);

src/cpp/src/module_genai/modules/md_text_encoder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ bool TextEncoderModule::initialize() {
9191
m_merge_length = std::pow(vision_config.spatial_merge_size, 2);
9292
} else {
9393
GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
94+
return false;
9495
}
9596

9697
return true;

src/cpp/src/module_genai/modules/md_vision_encoder.cpp

Lines changed: 356 additions & 53 deletions
Large diffs are not rendered by default.

src/cpp/src/module_genai/modules/md_vision_encoder.hpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "visual_language/processor_config.hpp"
99
#include "visual_language/vision_encoder.hpp"
1010
#include "visual_language/vlm_config.hpp"
11+
#include "model/qwen3_5/qwen3_5config.hpp"
1112

1213

1314
namespace ov {
@@ -20,6 +21,14 @@ class VisionEncoderModule : public IBaseModule {
2021
private:
2122
bool initialize();
2223
std::pair<ov::Tensor, ov::Tensor> embed(const EncodedImage &image, const std::vector<int>& images_sequence, const ov::Tensor& input_ids);
24+
Qwen3_5VisionEmbeddingResult embed(
25+
const ov::Tensor &pixel_values,
26+
const ov::Tensor &grid_thw,
27+
const ov::Tensor &pos_embeds,
28+
const ov::Tensor &rotary_cos,
29+
const ov::Tensor &rotary_sin,
30+
const ov::Tensor &input_ids,
31+
const ov::Tensor &attention_mask);
2332
ov::Tensor get_rotary_pos_emb(const std::vector<std::array<size_t, 3>>& grids_thw);
2433
size_t calc_vec_tokens_num(const std::vector<std::array<size_t, 3UL>>& vec_grid_thw) const;
2534
size_t calc_tokens_num(size_t grid_t, size_t grid_h, size_t grid_w) const;
@@ -33,7 +42,7 @@ class VisionEncoderModule : public IBaseModule {
3342
const int64_t vision_start_token_id,
3443
const std::vector<std::pair<std::size_t, std::size_t>>& history_vision_count);
3544

36-
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_embeddings_merger;
45+
std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_request_queue;
3746
bool m_with_cu_seqlens_input { false };
3847
VLMConfig m_vlm_config;
3948
ProcessorConfig m_processor_config;
@@ -42,6 +51,8 @@ class VisionEncoderModule : public IBaseModule {
4251
ov::Tensor m_position_ids;
4352
int64_t m_rope_delta = 0;
4453
int64_t m_vision_start_token_id = 0;
54+
int64_t m_image_pad_token_id = 0;
55+
int64_t m_video_pad_token_id = 0;
4556
};
4657

4758
REGISTER_MODULE_CONFIG(VisionEncoderModule) ;

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5config.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <vector>
88
#include <filesystem>
99
#include <array>
10+
#include "openvino/runtime/tensor.hpp"
1011

1112
namespace ov::genai::module {
1213

@@ -43,4 +44,11 @@ struct Qwen3_5VisionPreprocessConfig {
4344
static Qwen3_5VisionPreprocessConfig from_json_file(const std::filesystem::path& path);
4445
};
4546

47+
// Plain aggregate of four ov::Tensor members. It is the return type of the
// Qwen 3.5 overload of VisionEncoderModule::embed declared in
// md_vision_encoder.hpp (the overload taking pixel_values, grid_thw,
// pos_embeds, rotary_cos, rotary_sin, input_ids and attention_mask).
// NOTE(review): shapes, element types and exact semantics of the individual
// tensors are not visible in this header — confirm against the embed()
// implementation in md_vision_encoder.cpp before relying on them.
struct Qwen3_5VisionEmbeddingResult {
48+
ov::Tensor position_ids;
49+
ov::Tensor visual_pos_mask;
50+
ov::Tensor rope_deltas;
51+
ov::Tensor visual_embeds;
52+
};
53+
4654
}

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
2626
load_pos_embed_weight(model_path);
2727
}
2828

29-
std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
29+
Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
3030
const auto img_shape = images.get_shape();
3131
if (img_shape.size() != 3 && img_shape.size() != 4) {
3232
OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
#include <string>
1010
#include "openvino/runtime/tensor.hpp"
1111
#include "qwen3_5config.hpp"
12-
#include "../../preprocessor.hpp"
1312

1413
namespace ov::genai::module {
1514

@@ -21,11 +20,11 @@ struct Qwen3_5PreprocessorOutput {
2120
ov::Tensor rotary_sin;
2221
};
2322

24-
class Qwen3_5Preprocessor : public Preprocessor {
23+
class Qwen3_5Preprocessor {
2524
public:
2625
explicit Qwen3_5Preprocessor(const std::filesystem::path& model_path);
2726

28-
std::any preprocess(const ov::Tensor &images) override;
27+
Qwen3_5PreprocessorOutput preprocess(const ov::Tensor &images);
2928
private:
3029
Qwen3_5VisionPreprocessConfig m_preprocess_config;
3130
Qwen3_5VisionConfig m_vision_config;

src/cpp/src/module_genai/modules/preprocessor.hpp

Lines changed: 0 additions & 18 deletions
This file was deleted.

tests/module_genai/cpp/modules/TextEncoderModule.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -184,10 +184,10 @@ class Qwen3_5TextEncoderModuleTest : public ModuleTestBase, public ::testing::Te
184184
input_node("image", to_string(DataType::OVTensor), pipeline_params_name + ".image"));
185185
cur_node["outputs"] = YAML::Node(YAML::NodeType::Sequence);
186186
cur_node["outputs"].push_back(output_node("pixel_values", to_string(DataType::OVTensor)));
187-
cur_node["outputs"].push_back(output_node("grid_thw", to_string(DataType::VecInt)));
188-
cur_node["outputs"].push_back(output_node("pos_embeds", to_string(DataType::VecInt)));
189-
cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::VecInt)));
190-
cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::VecInt)));
187+
cur_node["outputs"].push_back(output_node("grid_thw", to_string(DataType::OVTensor)));
188+
cur_node["outputs"].push_back(output_node("pos_embeds", to_string(DataType::OVTensor)));
189+
cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::OVTensor)));
190+
cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::OVTensor)));
191191
cur_node["params"] = YAML::Node();
192192
cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5();
193193
pipeline_modules[image_preprocessor_name] = cur_node;

0 commit comments

Comments
 (0)