xipingyan
diff --git a/‎src/cpp/src/module_genai/modules/md_img_preprocess.cpp‎
Lines changed: 53 additions & 20 deletions b/‎src/cpp/src/module_genai/modules/md_img_preprocess.cpp‎
Lines changed: 53 additions & 20 deletions
diff --git a/‎src/cpp/src/module_genai/modules/md_img_preprocess.hpp‎
Lines changed: 2 additions & 2 deletions b/‎src/cpp/src/module_genai/modules/md_img_preprocess.hpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/cpp/src/module_genai/modules/qwen3_5config.cpp‎
Lines changed: 66 additions & 0 deletions b/‎src/cpp/src/module_genai/modules/qwen3_5config.cpp‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎src/cpp/src/module_genai/modules/qwen3_5config.hpp‎
Lines changed: 46 additions & 0 deletions b/‎src/cpp/src/module_genai/modules/qwen3_5config.hpp‎
Lines changed: 46 additions & 0 deletions
@@ -4,6 +4,7 @@
 #include "md_img_preprocess.hpp"
 
 #include "module_genai/module_factory.hpp"
+#include "module_genai/utils/tensor_utils.hpp"
 
 #include <chrono>
 #include <thread>
@@ -34,14 +35,24 @@ void ImagePreprocessModule::print_static_config() {
         type: "VecOVTensor"     # Support DataType: [VecOVTensor]
         source: "ParentModuleName.OutputPortName"
     outputs:
-      - name: "raw_data"        # Output port name
+      - name: "raw_data"        # Output port name, used by Qwen 2.5-VL
         type: "OVTensor"        # Support DataType: [OVTensor]
-      - name: "source_size"     # Output port name
+      - name: "source_size"     # Output port name, used by Qwen 2.5-VL
         type: "VecInt"          # Support DataType: [VecInt]
-      - name: "raw_datas"       # batch processed vision output
+      - name: "raw_datas"       # batch processed vision output, used by Qwen 2.5-VL
         type: "VecOVTensor"     # Support DataType: [VecOVTensor]
-      - name: "source_sizes"    # Output port name
+      - name: "source_sizes"    # Output port name, used by Qwen 2.5-VL
         type: "VecVecInt"       # Support DataType: [VecVecInt]
+      - name: "pixel_values"    # Output port name, used by Qwen 3.5
+        type: "OVTensor"        # Support DataType: [OVTensor]
+      - name: "grid_thw"        # Output port name, used by Qwen 3.5
+        type: "OVTensor"        # Support DataType: [OVTensor]
+      - name: "pos_embeds"      # Output port name, used by Qwen 3.5
+        type: "OVTensor"        # Support DataType: [OVTensor]
+      - name: "rotary_cos"      # Output port name, used by Qwen 3.5
+        type: "OVTensor"        # Support DataType: [OVTensor]
+      - name: "rotary_sin"      # Output port name, used by Qwen 3.5
+        type: "OVTensor"        # Support DataType: [OVTensor]
     params:
       target_resolution: [224, 224]   # optional
       mean: [0.485, 0.456, 0.406]     # optional
@@ -62,6 +73,8 @@ ImagePreprocessModule::ImagePreprocessModule(const IBaseModuleDesc::PTR& desc, c
 
     if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
         encoder_ptr = std::make_shared<VisionEncoderQwen2VL>(std::filesystem::path(model_path), device, ov::AnyMap{});
+    } else if (model_type == VLMModelType::QWEN3_5) {
+        encoder_ptr = std::make_shared<Qwen3_5Preprocessor>(std::filesystem::path(model_path));
     } else {
         GENAI_ERR("ImagePreprocessModule[" + desc->name + "]: Unsupported model type: " + desc->model_type);
     }
@@ -72,28 +85,48 @@ ImagePreprocessModule::~ImagePreprocessModule() {}
 void ImagePreprocessModule::run() {
     GENAI_INFO("Running module: " + module_desc->name);
     prepare_inputs();
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);  // Model family selects the encoder_ptr alternative and the output ports written below.
 
     if (exists_input("images")) {
         auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
-        std::vector<ov::Tensor> output_tensors;
-        std::vector<ImageSize> output_sizes;
-        for (size_t i = 0; i < images_data.size(); ++i) {
-            auto encoded_img = encoder_ptr->encode(images_data[i], ov::AnyMap{});
-            output_tensors.push_back(encoded_img.resized_source);
-            output_sizes.push_back(encoded_img.resized_source_size);
-        }
-        this->outputs["raw_datas"].data = output_tensors;
-        std::vector<std::vector<int>> sizes_vec;
-        for (const auto& sz : output_sizes) {
-            sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});
+        if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {  // Qwen2-VL / Qwen2.5-VL: encode the images one at a time.
+            std::vector<ov::Tensor> output_tensors;
+            std::vector<ImageSize> output_sizes;
+            for (size_t i = 0; i < images_data.size(); ++i) {
+                auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(images_data[i], ov::AnyMap{});  // std::get throws std::bad_variant_access if encoder_ptr holds the other alternative.
+                output_tensors.push_back(encoded_img.resized_source);
+                output_sizes.push_back(encoded_img.resized_source_size);
+            }
+            this->outputs["raw_datas"].data = output_tensors;
+            std::vector<std::vector<int>> sizes_vec;
+            for (const auto& sz : output_sizes) {
+                sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});  // Each entry is {height, width}.
+            }
+            this->outputs["source_sizes"].data = sizes_vec;
+        } else if (model_type == VLMModelType::QWEN3_5) {  // Qwen 3.5: a single preprocess() call over the stacked batch.
+            ov::Tensor images = tensor_utils::stack(images_data, 0);  // NOTE(review): presumably needs a non-empty list of same-shaped tensors - confirm against tensor_utils::stack.
+            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
+            this->outputs["pixel_values"].data = output.pixel_values;
+            this->outputs["grid_thw"].data = output.grid_thw;
+            this->outputs["pos_embeds"].data = output.pos_embeds;
+            this->outputs["rotary_cos"].data = output.rotary_cos;
+            this->outputs["rotary_sin"].data = output.rotary_sin;
         }
-        this->outputs["source_sizes"].data = sizes_vec;
     } else {
         auto image1_data = get_input("image").as<ov::Tensor>();
-        auto encoded_img = encoder_ptr->encode(image1_data, ov::AnyMap{});
-        this->outputs["raw_data"].data = encoded_img.resized_source;
-        this->outputs["source_size"].data =
-            std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
+        if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {  // Single-image path writes the singular ports "raw_data" / "source_size".
+            auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});
+            this->outputs["raw_data"].data = encoded_img.resized_source;
+            this->outputs["source_size"].data =
+                std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
+        } else if (model_type == VLMModelType::QWEN3_5) {  // Writes the same five ports as the batched Qwen 3.5 branch, from the unstacked tensor.
+            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
+            this->outputs["pixel_values"].data = output.pixel_values;
+            this->outputs["grid_thw"].data = output.grid_thw;
+            this->outputs["pos_embeds"].data = output.pos_embeds;
+            this->outputs["rotary_cos"].data = output.rotary_cos;
+            this->outputs["rotary_sin"].data = output.rotary_sin;
+        }  // Any other model type reaches this point without writing an output.
     }
 }
 
 
@@ -7,7 +7,7 @@
 
 #include "module_genai/module.hpp"
 #include "module_genai/module_type.hpp"
-
+#include "qwen3_5preprocessor.hpp"
 #include "visual_language/qwen2vl/classes.hpp"
 
 namespace ov {
@@ -17,7 +17,7 @@ class ImagePreprocessModule : public IBaseModule {
     DeclareModuleConstructor(ImagePreprocessModule);
 
 private:
-    std::shared_ptr<VisionEncoderQwen2VL> encoder_ptr = nullptr;
+    std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
 };
 
 REGISTER_MODULE_CONFIG(ImagePreprocessModule);
 
@@ -0,0 +1,66 @@
+// Copyright (C) 2023-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#include <fstream>
+#include "nlohmann/json.hpp"
+#include "qwen3_5config.hpp"
+#include "openvino/core/except.hpp"
+#include "json_utils.hpp"
+
+namespace ov::genai::module {
+
+Qwen3_5VisionConfig Qwen3_5VisionConfig::from_json_file(const std::filesystem::path &path) {  // Builds a config from the "vision_config.*" entries of the JSON file at `path`.
+    std::ifstream json_file(path);
+    if (!json_file.is_open()) {
+        OPENVINO_THROW("Failed to open vision config file: ", path.string());  // The only failure raised explicitly by this function.
+    }
+    nlohmann::json data;
+    json_file >> data;  // Parse failures surface from nlohmann::json; they are not handled here.
+    Qwen3_5VisionConfig cfg;  // Starts from the in-class defaults. NOTE(review): read_json_param presumably leaves a field untouched when its key is absent - confirm in json_utils.hpp.
+    using ov::genai::utils::read_json_param;
+    read_json_param(data, "vision_config.model_type", cfg.model_type);  // NOTE(review): dotted names presumably address nested objects - confirm read_json_param resolves them.
+    read_json_param(data, "vision_config.depth", cfg.depth);
+    read_json_param(data, "vision_config.hidden_size", cfg.hidden_size);
+    read_json_param(data, "vision_config.hidden_act", cfg.hidden_act);
+    read_json_param(data, "vision_config.intermediate_size", cfg.intermediate_size);
+    read_json_param(data, "vision_config.num_heads", cfg.num_heads);
+    read_json_param(data, "vision_config.in_channels", cfg.in_channels);
+    read_json_param(data, "vision_config.patch_size", cfg.patch_size);
+    read_json_param(data, "vision_config.spatial_merge_size", cfg.spatial_merge_size);
+    read_json_param(data, "vision_config.temporal_patch_size", cfg.temporal_patch_size);
+    read_json_param(data, "vision_config.out_hidden_size", cfg.out_hidden_size);
+    read_json_param(data, "vision_config.num_position_embeddings", cfg.num_position_embeddings);
+    read_json_param(data, "vision_config.deepstack_visual_indexes", cfg.deepstack_visual_indexes);  // The only std::vector field. NOTE(review): confirm the overload selected for vectors also handles the dotted name.
+    read_json_param(data, "vision_config.initializer_range", cfg.initializer_range);
+
+    return cfg;
+}
+
+int32_t Qwen3_5VisionConfig::head_dim() const {
+    // Integer quotient hidden_size / num_heads.
+    // A non-positive num_heads (such as the zero default) yields 0 instead of dividing.
+    const bool has_heads = num_heads > 0;
+    return has_heads ? hidden_size / num_heads : 0;
+}
+
+Qwen3_5VisionPreprocessConfig Qwen3_5VisionPreprocessConfig::from_json_file(const std::filesystem::path &path) {  // Builds a preprocessing config from the JSON file at `path`.
+    std::ifstream json_file(path);
+    if (!json_file.is_open()) {
+        OPENVINO_THROW("Failed to open vision preprocess config file: ", path.string());  // The only failure raised explicitly by this function.
+    }
+    nlohmann::json data;
+    json_file >> data;  // Parse failures surface from nlohmann::json; they are not handled here.
+    Qwen3_5VisionPreprocessConfig cfg;  // Starts from the in-class defaults declared on the struct.
+    using ov::genai::utils::read_json_param;
+    read_json_param(data, "size.shortest_edge", cfg.min_pixels);  // Key name and field name differ: "shortest_edge" feeds min_pixels.
+    read_json_param(data, "size.longest_edge", cfg.max_pixels);  // Likewise "longest_edge" feeds max_pixels.
+    read_json_param(data, "patch_size", cfg.patch_size);
+    read_json_param(data, "temporal_patch_size", cfg.temporal_patch_size);
+    read_json_param(data, "merge_size", cfg.merge_size);
+    read_json_param(data, "image_mean", cfg.image_mean);  // NOTE(review): target is std::array<float, 3> - confirm read_json_param supports fixed-size arrays.
+    read_json_param(data, "image_std", cfg.image_std);
+
+    return cfg;  // cfg.do_resize is never read from the file, so it keeps its default.
+}
+
+}
@@ -0,0 +1,46 @@
+// Copyright (C) 2023-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <filesystem>
+#include <array>
+
+namespace ov::genai::module {
+
+struct Qwen3_5VisionConfig {  // Vision settings that from_json_file() fills from "vision_config.*" JSON keys.
+    std::string model_type = "qwen3_5";
+    int32_t depth = 0;  // NOTE(review): int32_t is used here but this header does not include <cstdint>.
+    int32_t hidden_size = 0;  // Numerator in head_dim().
+    std::string hidden_act = "gelu_pytorch_tanh";
+    int32_t intermediate_size = 0;
+    int32_t num_heads = 0;  // Divisor in head_dim().
+    int32_t in_channels = 3;
+    int32_t patch_size = 16;
+    int32_t spatial_merge_size = 2;
+    int32_t temporal_patch_size = 2;
+    int32_t out_hidden_size = 0;
+    int32_t num_position_embeddings = 0;
+    std::vector<int32_t> deepstack_visual_indexes;  // Empty by default.
+    float initializer_range = 0.02f;
+
+    static Qwen3_5VisionConfig from_json_file(const std::filesystem::path& path);  // Raises via OPENVINO_THROW when the file cannot be opened.
+    int32_t head_dim() const;  // hidden_size / num_heads (integer division), or 0 when num_heads <= 0.
+};
+
+struct Qwen3_5VisionPreprocessConfig {  // Image preprocessing settings; the values below are the defaults used before from_json_file() reads the file.
+    int64_t min_pixels = 56 * 56;  // Loaded from JSON key "size.shortest_edge" by from_json_file().
+    int64_t max_pixels = 28 * 28 * 1280;  // Loaded from JSON key "size.longest_edge" by from_json_file().
+    int32_t patch_size = 16;
+    int32_t temporal_patch_size = 2;
+    int32_t merge_size = 2;  // Loaded from JSON key "merge_size".
+    std::array<float, 3> image_mean = {0.5f, 0.5f, 0.5f};  // NOTE(review): presumably one value per colour channel - confirm against the preprocessor.
+    std::array<float, 3> image_std = {0.5f, 0.5f, 0.5f};
+    bool do_resize = true;  // Not read by from_json_file(); stays at this default unless the caller changes it.
+
+    static Qwen3_5VisionPreprocessConfig from_json_file(const std::filesystem::path& path);  // Raises via OPENVINO_THROW when the file cannot be opened.
+};
+
+}