Skip to content

Commit 459bd02

Browse files
committed
Enable Qwen3.5 image preprocessor
Enable Qwen3.5 image preprocessor. Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
1 parent 370fd44 commit 459bd02

File tree

12 files changed

+978
-25
lines changed

12 files changed

+978
-25
lines changed

src/cpp/src/module_genai/modules/md_img_preprocess.cpp

Lines changed: 54 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "md_img_preprocess.hpp"
55

66
#include "module_genai/module_factory.hpp"
7+
#include "module_genai/utils/tensor_utils.hpp"
78

89
#include <chrono>
910
#include <thread>
@@ -34,14 +35,24 @@ void ImagePreprocessModule::print_static_config() {
3435
type: "VecOVTensor" # Support DataType: [VecOVTensor]
3536
source: "ParentModuleName.OutputPortName"
3637
outputs:
37-
- name: "raw_data" # Output port name
38+
- name: "raw_data" # Output port name, used by Qwen 2.5-VL
3839
type: "OVTensor" # Support DataType: [OVTensor]
39-
- name: "source_size" # Output port name
40+
- name: "source_size" # Output port name, used by Qwen 2.5-VL
4041
type: "VecInt" # Support DataType: [VecInt]
41-
- name: "raw_datas" # batch processed vision output
42+
- name: "raw_datas" # batch processed vision output, used by Qwen 2.5-VL
4243
type: "VecOVTensor" # Support DataType: [VecOVTensor]
43-
- name: "source_sizes" # Output port name
44+
- name: "source_sizes" # Output port name, used by Qwen 2.5-VL
4445
type: "VecVecInt" # Support DataType: [VecVecInt]
46+
- name: "pixel_values" # Output port name, used by Qwen 3.5
47+
type: "OVTensor" # Support DataType: [OVTensor]
48+
- name: "grid_thw" # Output port name, used by Qwen 3.5
49+
type: "OVTensor" # Support DataType: [OVTensor]
50+
- name: "pos_embeds" # Output port name, used by Qwen 3.5
51+
type: "OVTensor" # Support DataType: [OVTensor]
52+
- name: "rotary_cos" # Output port name, used by Qwen 3.5
53+
type: "OVTensor" # Support DataType: [OVTensor]
54+
- name: "rotary_sin" # Output port name, used by Qwen 3.5
55+
type: "OVTensor" # Support DataType: [OVTensor]
4556
params:
4657
target_resolution: [224, 224] # optional
4758
mean: [0.485, 0.456, 0.406] # optional
@@ -62,8 +73,10 @@ ImagePreprocessModule::ImagePreprocessModule(const IBaseModuleDesc::PTR& desc, c
6273

6374
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
6475
encoder_ptr = std::make_shared<VisionEncoderQwen2VL>(std::filesystem::path(model_path), device, ov::AnyMap{});
76+
} else if (model_type == VLMModelType::QWEN3_5) {
77+
encoder_ptr = std::make_shared<Qwen3_5Preprocessor>(std::filesystem::path(model_path));
6578
} else {
66-
GENAI_ERR("ImagePreprocessModule[" + desc->name + "]: Unsupported model type: " + desc->model_type);
79+
OPENVINO_THROW("ImagePreprocessModule[" + desc->name + "]: Unsupported model type: " + desc->model_type);
6780
}
6881
}
6982

@@ -72,28 +85,48 @@ ImagePreprocessModule::~ImagePreprocessModule() {}
7285
void ImagePreprocessModule::run() {
7386
GENAI_INFO("Running module: " + module_desc->name);
7487
prepare_inputs();
88+
VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
7589

7690
if (exists_input("images")) {
7791
auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
78-
std::vector<ov::Tensor> output_tensors;
79-
std::vector<ImageSize> output_sizes;
80-
for (size_t i = 0; i < images_data.size(); ++i) {
81-
auto encoded_img = encoder_ptr->encode(images_data[i], ov::AnyMap{});
82-
output_tensors.push_back(encoded_img.resized_source);
83-
output_sizes.push_back(encoded_img.resized_source_size);
84-
}
85-
this->outputs["raw_datas"].data = output_tensors;
86-
std::vector<std::vector<int>> sizes_vec;
87-
for (const auto& sz : output_sizes) {
88-
sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});
92+
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
93+
std::vector<ov::Tensor> output_tensors;
94+
std::vector<ImageSize> output_sizes;
95+
for (size_t i = 0; i < images_data.size(); ++i) {
96+
auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(images_data[i], ov::AnyMap{});
97+
output_tensors.push_back(encoded_img.resized_source);
98+
output_sizes.push_back(encoded_img.resized_source_size);
99+
}
100+
this->outputs["raw_datas"].data = output_tensors;
101+
std::vector<std::vector<int>> sizes_vec;
102+
for (const auto& sz : output_sizes) {
103+
sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});
104+
}
105+
this->outputs["source_sizes"].data = sizes_vec;
106+
} else if (model_type == VLMModelType::QWEN3_5) {
107+
ov::Tensor images = tensor_utils::stack(images_data, 0);
108+
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
109+
this->outputs["pixel_values"].data = output.pixel_values;
110+
this->outputs["grid_thw"].data = output.grid_thw;
111+
this->outputs["pos_embeds"].data = output.pos_embeds;
112+
this->outputs["rotary_cos"].data = output.rotary_cos;
113+
this->outputs["rotary_sin"].data = output.rotary_sin;
89114
}
90-
this->outputs["source_sizes"].data = sizes_vec;
91115
} else {
92116
auto image1_data = get_input("image").as<ov::Tensor>();
93-
auto encoded_img = encoder_ptr->encode(image1_data, ov::AnyMap{});
94-
this->outputs["raw_data"].data = encoded_img.resized_source;
95-
this->outputs["source_size"].data =
96-
std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
117+
if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
118+
auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});
119+
this->outputs["raw_data"].data = encoded_img.resized_source;
120+
this->outputs["source_size"].data =
121+
std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
122+
} else if (model_type == VLMModelType::QWEN3_5) {
123+
Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
124+
this->outputs["pixel_values"].data = output.pixel_values;
125+
this->outputs["grid_thw"].data = output.grid_thw;
126+
this->outputs["pos_embeds"].data = output.pos_embeds;
127+
this->outputs["rotary_cos"].data = output.rotary_cos;
128+
this->outputs["rotary_sin"].data = output.rotary_sin;
129+
}
97130
}
98131
}
99132

src/cpp/src/module_genai/modules/md_img_preprocess.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,11 @@
44
#pragma once
55

66
#include <yaml-cpp/yaml.h>
7+
#include <variant>
78

89
#include "module_genai/module.hpp"
910
#include "module_genai/module_type.hpp"
10-
11+
#include "qwen3_5preprocessor.hpp"
1112
#include "visual_language/qwen2vl/classes.hpp"
1213

1314
namespace ov {
@@ -17,7 +18,7 @@ class ImagePreprocessModule : public IBaseModule {
1718
DeclareModuleConstructor(ImagePreprocessModule);
1819

1920
private:
20-
std::shared_ptr<VisionEncoderQwen2VL> encoder_ptr = nullptr;
21+
std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
2122
};
2223

2324
REGISTER_MODULE_CONFIG(ImagePreprocessModule);
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// Copyright (C) 2023-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#include <fstream>
5+
#include "nlohmann/json.hpp"
6+
#include "qwen3_5config.hpp"
7+
#include "openvino/core/except.hpp"
8+
#include "json_utils.hpp"
9+
10+
namespace ov::genai::module {
11+
12+
// Parses the Qwen 3.5 vision-tower configuration from a model config.json.
// Fields absent from the file keep the struct's in-class defaults.
// Throws if the file cannot be opened.
Qwen3_5VisionConfig Qwen3_5VisionConfig::from_json_file(const std::filesystem::path& path) {
    std::ifstream json_file(path);
    if (!json_file.is_open()) {
        OPENVINO_THROW("Failed to open vision config file: ", path.string());
    }
    nlohmann::json data;
    json_file >> data;

    Qwen3_5VisionConfig cfg;
    using ov::genai::utils::read_json_param;
    // Look the nested "vision_config" object up explicitly rather than using
    // dotted "vision_config.xxx" keys: nlohmann::json treats such a string as
    // one literal key, so the dotted lookups would silently leave every field
    // at its default. (NOTE(review): equivalent either way if read_json_param
    // supports dotted paths — confirm against json_utils.)
    if (data.contains("vision_config")) {
        const nlohmann::json& vc = data["vision_config"];
        read_json_param(vc, "model_type", cfg.model_type);
        read_json_param(vc, "depth", cfg.depth);
        read_json_param(vc, "hidden_size", cfg.hidden_size);
        read_json_param(vc, "hidden_act", cfg.hidden_act);
        read_json_param(vc, "intermediate_size", cfg.intermediate_size);
        read_json_param(vc, "num_heads", cfg.num_heads);
        read_json_param(vc, "in_channels", cfg.in_channels);
        read_json_param(vc, "patch_size", cfg.patch_size);
        read_json_param(vc, "spatial_merge_size", cfg.spatial_merge_size);
        read_json_param(vc, "temporal_patch_size", cfg.temporal_patch_size);
        read_json_param(vc, "out_hidden_size", cfg.out_hidden_size);
        read_json_param(vc, "num_position_embeddings", cfg.num_position_embeddings);
        read_json_param(vc, "deepstack_visual_indexes", cfg.deepstack_visual_indexes);
        read_json_param(vc, "initializer_range", cfg.initializer_range);
    }

    return cfg;
}
38+
39+
// Per-head dimensionality of the vision attention layers
// (hidden_size split evenly across num_heads).
// Returns 0 when num_heads is unset or invalid, avoiding division by zero.
int32_t Qwen3_5VisionConfig::head_dim() const {
    if (num_heads <= 0) {
        return 0;
    }
    return hidden_size / num_heads;
}
45+
46+
// Parses the Qwen 3.5 image-preprocessor configuration (preprocessor_config.json).
// Fields absent from the file keep the struct's in-class defaults.
// Throws if the file cannot be opened.
Qwen3_5VisionPreprocessConfig Qwen3_5VisionPreprocessConfig::from_json_file(const std::filesystem::path& path) {
    std::ifstream json_file(path);
    if (!json_file.is_open()) {
        OPENVINO_THROW("Failed to open vision preprocess config file: ", path.string());
    }
    nlohmann::json data;
    json_file >> data;

    Qwen3_5VisionPreprocessConfig cfg;
    using ov::genai::utils::read_json_param;
    // "size" is a nested object in preprocessor_config.json; a dotted
    // "size.shortest_edge" key is a single literal key to nlohmann::json and
    // would never match, so read the sub-object explicitly.
    // (NOTE(review): confirm read_json_param has no dotted-path support.)
    if (data.contains("size")) {
        const nlohmann::json& size = data["size"];
        read_json_param(size, "shortest_edge", cfg.min_pixels);
        read_json_param(size, "longest_edge", cfg.max_pixels);
    }
    read_json_param(data, "patch_size", cfg.patch_size);
    read_json_param(data, "temporal_patch_size", cfg.temporal_patch_size);
    read_json_param(data, "merge_size", cfg.merge_size);
    read_json_param(data, "image_mean", cfg.image_mean);
    read_json_param(data, "image_std", cfg.image_std);
    // Fix: do_resize exists on the struct but was never populated from the file.
    read_json_param(data, "do_resize", cfg.do_resize);

    return cfg;
}
65+
66+
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Copyright (C) 2023-2026 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#pragma once
5+
6+
#include <string>
7+
#include <vector>
8+
#include <filesystem>
9+
#include <array>
10+
11+
namespace ov::genai::module {
12+
13+
// Vision-tower hyper-parameters for Qwen 3.5, loaded from the model's
// config.json. Defaults are fallbacks used when a field is absent from the
// file — presumably mirroring the reference checkpoint; verify against the
// published model configuration.
struct Qwen3_5VisionConfig {
    std::string model_type = "qwen3_5";
    int32_t depth = 0;                       // number of transformer blocks
    int32_t hidden_size = 0;                 // embedding width of the vision tower
    std::string hidden_act = "gelu_pytorch_tanh";
    int32_t intermediate_size = 0;           // MLP inner width
    int32_t num_heads = 0;                   // attention heads (see head_dim())
    int32_t in_channels = 3;                 // input image channels (RGB)
    int32_t patch_size = 16;                 // spatial patch edge in pixels
    int32_t spatial_merge_size = 2;          // patches merged per side after encoding
    int32_t temporal_patch_size = 2;         // frames grouped into one temporal patch
    int32_t out_hidden_size = 0;             // width after projection to the LLM
    int32_t num_position_embeddings = 0;
    std::vector<int32_t> deepstack_visual_indexes;  // layer indexes tapped for deepstack features
    float initializer_range = 0.02f;

    // Parses the "vision_config" section of a config.json; missing fields keep
    // the defaults above. Throws if the file cannot be opened.
    static Qwen3_5VisionConfig from_json_file(const std::filesystem::path& path);
    // hidden_size / num_heads; 0 when num_heads is unset.
    int32_t head_dim() const;
};
32+
33+
// Image preprocessing parameters for Qwen 3.5, loaded from
// preprocessor_config.json. Defaults are fallbacks used when a field is
// absent from the file.
struct Qwen3_5VisionPreprocessConfig {
    int64_t min_pixels = 56 * 56;            // lower resize bound ("size.shortest_edge")
    int64_t max_pixels = 28 * 28 * 1280;     // upper resize bound ("size.longest_edge")
    int32_t patch_size = 16;                 // spatial patch edge in pixels
    int32_t temporal_patch_size = 2;         // frames grouped into one temporal patch
    int32_t merge_size = 2;                  // patches merged per side
    std::array<float, 3> image_mean = {0.5f, 0.5f, 0.5f};  // per-channel normalization mean
    std::array<float, 3> image_std = {0.5f, 0.5f, 0.5f};   // per-channel normalization std
    bool do_resize = true;                   // whether resizing is applied before patching

    // Parses a preprocessor_config.json; missing fields keep the defaults
    // above. Throws if the file cannot be opened.
    static Qwen3_5VisionPreprocessConfig from_json_file(const std::filesystem::path& path);
};
45+
46+
}

0 commit comments

Comments
 (0)