Enable text encoder for Qwen 3.5

ZiniuLin · ZiniuLin · commit 2d0337374e99 · 2026-03-02T14:56:00.000+08:00
Enable text encoder for Qwen 3.5.

Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess.cpp b/src/cpp/src/module_genai/modules/md_img_preprocess.cpp
@@ -5,6 +5,7 @@
 
 #include "module_genai/module_factory.hpp"
 #include "module_genai/utils/tensor_utils.hpp"
+#include "model/qwen3_5/qwen3_5preprocessor.hpp"
 
 #include <chrono>
 #include <thread>
@@ -105,7 +106,7 @@ void ImagePreprocessModule::run() {
             this->outputs["source_sizes"].data = sizes_vec;
         } else if (model_type == VLMModelType::QWEN3_5) {
             ov::Tensor images = tensor_utils::stack(images_data, 0);
-            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
+            auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(images));
             this->outputs["pixel_values"].data = output.pixel_values;
             this->outputs["grid_thw"].data = output.grid_thw;
             this->outputs["pos_embeds"].data = output.pos_embeds;
@@ -120,7 +121,7 @@ void ImagePreprocessModule::run() {
             this->outputs["source_size"].data =
                 std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
         } else if (model_type == VLMModelType::QWEN3_5) {
-            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
+            auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(image1_data));
             this->outputs["pixel_values"].data = output.pixel_values;
             this->outputs["grid_thw"].data = output.grid_thw;
             this->outputs["pos_embeds"].data = output.pos_embeds;
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess.hpp b/src/cpp/src/module_genai/modules/md_img_preprocess.hpp
@@ -8,7 +8,7 @@
 
 #include "module_genai/module.hpp"
 #include "module_genai/module_type.hpp"
-#include "model/qwen3_5/qwen3_5preprocessor.hpp"
+#include "preprocessor.hpp"
 #include "visual_language/qwen2vl/classes.hpp"
 
 namespace ov {
@@ -18,7 +18,7 @@ class ImagePreprocessModule : public IBaseModule {
     DeclareModuleConstructor(ImagePreprocessModule);
 
 private:
-    std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
+    std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Preprocessor>> encoder_ptr;
 };
 
 REGISTER_MODULE_CONFIG(ImagePreprocessModule);
diff --git a/src/cpp/src/module_genai/modules/md_text_encoder.cpp b/src/cpp/src/module_genai/modules/md_text_encoder.cpp
@@ -6,6 +6,7 @@
 #include "module_genai/module_factory.hpp"
 #include "openvino/genai/tokenizer.hpp"
 #include "tokenizer/tokenizer_impl.hpp"
+#include "model/qwen3_5/qwen3_5config.hpp"
 
 #include <chrono>
 #include <thread>
@@ -32,24 +33,27 @@ void TextEncoderModule::print_static_config() {
       - name: "prompts"
         type: "VecString"         # [Optional] Support DataType: [VecString]
         source: "ParentModuleName.OutputPortName"
-      - name: "encoded_image"
+      - name: "encoded_image"     # Used by Qwen 2.5-VL
         type: "OVTensor"          # [Optional] Support DataType: [OVTensor]
         source: "ParentModuleName.OutputPortName"
-      - name: "encoded_images"
+      - name: "encoded_images"    # Used by Qwen 2.5-VL
         type: "VecOVTensor"       # [Optional] Support DataType: [VecOVTensor]
         source: "ParentModuleName.OutputPortName"
-      - name: "source_size"
+      - name: "source_size"       # Used by Qwen 2.5-VL
         type: "VecInt"            # [Optional] Support DataType: [VecInt]
         source: "ParentModuleName.OutputPortName"
-      - name: "source_sizes"
+      - name: "source_sizes"      # Used by Qwen 2.5-VL
         type: "VecVecInt"         # [Optional] Support DataType: [VecVecInt]
         source: "ParentModuleName.OutputPortName"
+      - name: "grid_thw"           # Used by Qwen 3.5
+        type: "OVTensor"          # [Optional] Support DataType: [OVTensor]
+        source: "ParentModuleName.OutputPortName"
     outputs:
       - name: "input_ids"
         type: "OVTensor"     # Support DataType: [OVTensor, OVRemoteTensor]
       - name: "mask"
         type: "OVTensor"     # Support DataType: [OVTensor, OVRemoteTensor]
-      - name: "images_sequence"
+      - name: "images_sequence"    # Output by Qwen 2.5-VL
         type: "VecInt"             # Support DataType: [VecInt]
     params:
       model_path: "models/text_encoder.xml"  # Optional. OpenVINO IR
@@ -77,14 +81,25 @@ bool TextEncoderModule::initialize() {
 
     m_tokenizer_impl = std::make_shared<Tokenizer::TokenizerImpl>(tokenizer_path, m_tokenization_params);
     OPENVINO_ASSERT(m_tokenizer_impl->m_ireq_queue_tokenizer != nullptr, std::string("Load tokenizer model fail: ") + tokenizer_path.string());
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
     m_vlm_config = utils::from_config_json_if_exists<VLMConfig>(tokenizer_path, "config.json");
     m_processor_config = utils::from_config_json_if_exists<ProcessorConfig>(tokenizer_path, "preprocessor_config.json");
-    m_merge_length = std::pow(m_processor_config.merge_size, 2);
+    if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
+        m_merge_length = std::pow(m_processor_config.merge_size, 2);
+    } else if (model_type == VLMModelType::QWEN3_5) {
+        Qwen3_5VisionConfig vision_config = Qwen3_5VisionConfig::from_json_file(tokenizer_path / "config.json");
+        m_merge_length = std::pow(vision_config.spatial_merge_size, 2);
+    } else {
+        GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
+    }
+    
     return true;
 }
 
 void TextEncoderModule::run() {
     GENAI_INFO("Running module: " + module_desc->name);
+
+    VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
     
     prepare_inputs();
     std::vector<std::string> m_prompts = {};
@@ -97,39 +112,52 @@ void TextEncoderModule::run() {
         OPENVINO_ASSERT(false, "TextEncoderModule[" + module_desc->name + "]: No prompt input found.");
     }
 
-    std::vector<ov::Tensor> encoded_images = {};
-    std::vector<std::vector<int>> source_sizes = {};
-    bool has_encoded_image = false;
-    if (exists_input("encoded_image")) {
-        ov::Tensor encoded_image = get_input("encoded_image").as<ov::Tensor>();
-        encoded_images.push_back(encoded_image);
-        has_encoded_image = true;
-    }
-    if (exists_input("encoded_images")) {
-        encoded_images = get_input("encoded_images").as<std::vector<ov::Tensor>>();
-        has_encoded_image = true;
-    }
-    if (exists_input("source_size")) {
-        source_sizes.push_back(get_input("source_size").as<std::vector<int>>());
-    }
-    if (exists_input("source_sizes")) {
-        source_sizes = get_input("source_sizes").as<std::vector<std::vector<int>>>();
-    }
+    if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
+        std::vector<ov::Tensor> encoded_images = {};
+        std::vector<std::vector<int>> source_sizes = {};
+        bool has_encoded_image = false;
+        if (exists_input("encoded_image")) {
+            ov::Tensor encoded_image = get_input("encoded_image").as<ov::Tensor>();
+            encoded_images.push_back(encoded_image);
+            has_encoded_image = true;
+        }
+        if (exists_input("encoded_images")) {
+            encoded_images = get_input("encoded_images").as<std::vector<ov::Tensor>>();
+            has_encoded_image = true;
+        }
+        if (exists_input("source_size")) {
+            source_sizes.push_back(get_input("source_size").as<std::vector<int>>());
+        }
+        if (exists_input("source_sizes")) {
+            source_sizes = get_input("source_sizes").as<std::vector<std::vector<int>>>();
+        }
 
-    if (has_encoded_image) {
-        VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
-        if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
-            GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
-            return;
+        if (has_encoded_image) {
+            VLMModelType model_type = to_vlm_model_type(module_desc->model_type);
+            if (model_type != VLMModelType::QWEN2_VL && model_type != VLMModelType::QWEN2_5_VL) {
+                GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
+                return;
+            }
         }
-    }
 
-    auto [encoded, images_sequence] = run(m_prompts, encoded_images, source_sizes, has_encoded_image);
+        auto [encoded, images_sequence] = run(m_prompts, encoded_images, source_sizes, has_encoded_image);
 
-    this->outputs["input_ids"].data = encoded.input_ids;
-    this->outputs["mask"].data = encoded.attention_mask;
-    if (images_sequence.size() > 0) {
-        this->outputs["images_sequence"].data = images_sequence;
+        this->outputs["input_ids"].data = encoded.input_ids;
+        this->outputs["mask"].data = encoded.attention_mask;
+        if (images_sequence.size() > 0) {
+            this->outputs["images_sequence"].data = images_sequence;
+        }
+    } else if (model_type == VLMModelType::QWEN3_5) {
+        std::optional<ov::Tensor> grid_thw = std::nullopt;
+        if (exists_input("grid_thw")) {
+            grid_thw = get_input("grid_thw").as<ov::Tensor>();
+        }
+
+        auto encoded = run(m_prompts, grid_thw);
+        this->outputs["input_ids"].data = encoded.input_ids;
+        this->outputs["mask"].data = encoded.attention_mask;
+    } else {
+        OPENVINO_THROW("Unsupported model type: " + module_desc->model_type);
     }
 }
 
@@ -160,24 +188,45 @@ std::pair<TokenizedInputs, std::vector<int>> TextEncoderModule::run(const std::v
     }
 }
 
+TokenizedInputs TextEncoderModule::run(const std::vector<std::string>& prompts, std::optional<ov::Tensor>& grid_thw) {  // Tokenizes prompts for the grid_thw-based path; the QWEN3_5 branch of run() calls this overload.
+    if (grid_thw.has_value()) {  // grid supplied: expand image tags per prompt, then wrap each prompt in a single user turn
+        std::vector<std::string> unified_prompts = {};
+        for (const auto &prompt : prompts) {
+            // Base image/video ids are hard-coded to 0; only unified_prompt is used, the returned sequences are discarded.
+            auto [unified_prompt, images_sequence, videos_sequence] = normalize_prompt(prompt, 0, 0, grid_thw.value());
+            std::stringstream ss;
+            ss << "<|im_start|>user\n";  // chat markers are hard-coded here rather than read from a template
+            ss << unified_prompt;
+            ss << "<|im_end|>\n<|im_start|>assistant\n";  // closes the user turn and opens the assistant turn
+            unified_prompts.push_back(ss.str());
+        }
+        return m_tokenizer_impl->encode(unified_prompts, m_tokenization_params);
+    } else {  // no grid_thw: prompts are tokenized as given, without the chat wrapper above
+        return m_tokenizer_impl->encode(prompts, m_tokenization_params);
+    }
+}
+
 NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
                                       size_t base_image_id,
                                       size_t base_video_id,
                                       const std::vector<ov::Tensor>& encoded_images,
                                       const std::vector<ov::Tensor>& encoded_videos,
                                       const std::vector<std::vector<int>>& source_sizes) {
-    auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, encoded_images.size());
-    std::vector<std::array<size_t, 3>> images_grid_thw;
-    images_grid_thw.reserve(encoded_images.size());
-    for (const auto& source_size : source_sizes) {
-        size_t grid_t = 1;
-        size_t grid_h = source_size[0];
-        size_t grid_w = source_size[1];
-        images_grid_thw.push_back({grid_t, grid_h, grid_w});
-    }
+    auto thw = calc_thw(source_sizes);
+    return normalize_prompt(prompt, base_image_id, base_video_id, thw);
+}
 
+NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
+                                  size_t base_image_id,
+                                  size_t base_video_id,
+                                  const ov::Tensor& grid_thw) {
+    const ov::Shape& thw_shape = grid_thw.get_shape();
+    auto thw_data = grid_thw.data<const int64_t>();
+    auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_image_id, thw_shape[0]);
     for (size_t new_image_id : images_sequence) {
-        auto [grid_t, grid_h, grid_w] = images_grid_thw.at(new_image_id - base_image_id);
+        size_t grid_t = thw_data[(new_image_id - base_image_id) * 3 + 0];
+        size_t grid_h = thw_data[(new_image_id - base_image_id) * 3 + 1];
+        size_t grid_w = thw_data[(new_image_id - base_image_id) * 3 + 2];
         const size_t num_image_pad_tokens = calc_tokens_num(grid_t, grid_h, grid_w);
 
         std::string expanded_tag;
@@ -189,6 +238,9 @@ NormalizedPrompt TextEncoderModule::normalize_prompt(const std::string& prompt,
             expanded_tag.append(m_vlm_config.image_pad_token);
         }
         expanded_tag.append(m_vlm_config.vision_end_token);
+        if (to_vlm_model_type(module_desc->model_type) == VLMModelType::QWEN3_5) {
+            expanded_tag.append("\n");
+        }
 
         unified_prompt.replace(unified_prompt.find(NATIVE_TAG), NATIVE_TAG.length(), expanded_tag);
     }
@@ -233,6 +285,20 @@ size_t TextEncoderModule::calc_tokens_num(size_t grid_t, size_t grid_h, size_t g
     return grid_t * grid_h * grid_w / m_merge_length;
 }
 
+ov::Tensor TextEncoderModule::calc_thw(const std::vector<std::vector<int>>& source_sizes) {  // Packs source_sizes into an i64 tensor of shape [source_sizes.size(), 3], one (t, h, w) row per entry.
+    ov::Tensor thw_tensor(ov::element::i64, ov::Shape{source_sizes.size(), 3});
+    auto thw_data = thw_tensor.data<int64_t>();
+    for (size_t i = 0; i < source_sizes.size(); i++) {
+        int64_t grid_t = 1;  // t is fixed to 1 for every entry
+        auto grid_h = static_cast<int64_t>(source_sizes[i].at(0));  // .at(): throws std::out_of_range on an entry with fewer than two values instead of reading out of bounds
+        auto grid_w = static_cast<int64_t>(source_sizes[i].at(1));
+        thw_data[i * 3 + 0] = grid_t;  // row-major layout: row i occupies elements [3*i, 3*i + 2]
+        thw_data[i * 3 + 1] = grid_h;
+        thw_data[i * 3 + 2] = grid_w;
+    }
+    return thw_tensor;
+}
+
 }  // namespace module
 }  // namespace genai
 }  // namespace ov
diff --git a/src/cpp/src/module_genai/modules/md_text_encoder.hpp b/src/cpp/src/module_genai/modules/md_text_encoder.hpp
@@ -30,19 +30,26 @@ class TextEncoderModule : public IBaseModule {
                         const std::vector<ov::Tensor>& encoded_images,
                         const std::vector<std::vector<int>>& source_sizes,
                         bool has_encoded_image = false);
+    TokenizedInputs run(const std::vector<std::string>& prompts, std::optional<ov::Tensor>& grid_thw);
     NormalizedPrompt normalize_prompt(const std::string& prompt,
                                       size_t base_image_id,
                                       size_t base_video_id,
                                       const std::vector<ov::Tensor>& encoded_images,
                                       const std::vector<ov::Tensor>& encoded_videos,
                                       const std::vector<std::vector<int>>& source_sizes);
+    NormalizedPrompt normalize_prompt(const std::string& prompt,
+                                      size_t base_image_id,
+                                      size_t base_video_id,
+                                      const ov::Tensor& grid_thw);
     std::pair<std::string, std::vector<size_t>> normalize(
             const std::string& prompt,
             const std::string& native_tag,
             const std::string& automatic_tag,
             size_t base_id,
             size_t n_images);
     size_t calc_tokens_num(size_t grid_t, size_t grid_h, size_t grid_w) const;
+
+    ov::Tensor calc_thw(const std::vector<std::vector<int>>& source_sizes);
 };
 
 REGISTER_MODULE_CONFIG(TextEncoderModule);
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp
@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
     load_pos_embed_weight(model_path);
 }
 
-Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
+std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
     const auto img_shape = images.get_shape();
     if (img_shape.size() != 3 && img_shape.size() != 4) {
         OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");
@@ -173,7 +173,7 @@ Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &imag
     auto pos_embeds = build_pos_embeds(grid_thw);
     auto rotary = build_rotary_cos_sin(grid_thw);
 
-    return {pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};
+    return Qwen3_5PreprocessorOutput{pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};
 }
 
 void Qwen3_5Preprocessor::load_pos_embed_weight(const std::filesystem::path &model_path) {
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp
@@ -9,6 +9,7 @@
 #include <string>
 #include "openvino/runtime/tensor.hpp"
 #include "qwen3_5config.hpp"
+#include "../../preprocessor.hpp"
 
 namespace ov::genai::module {
 
@@ -20,11 +21,11 @@ struct Qwen3_5PreprocessorOutput {
     ov::Tensor rotary_sin;
 };
 
-class Qwen3_5Preprocessor {
+class Qwen3_5Preprocessor : public Preprocessor {
 public:
     explicit Qwen3_5Preprocessor(const std::filesystem::path& model_path);
 
-    Qwen3_5PreprocessorOutput preprocess(const ov::Tensor &images);
+    std::any preprocess(const ov::Tensor &images) override;
 private:
     Qwen3_5VisionPreprocessConfig m_preprocess_config;
     Qwen3_5VisionConfig m_vision_config;
diff --git a/src/cpp/src/module_genai/modules/preprocessor.hpp b/src/cpp/src/module_genai/modules/preprocessor.hpp
@@ -0,0 +1,18 @@
+// Copyright (C) 2023-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+#include <any>
+#include <openvino/runtime/tensor.hpp>
+
+namespace ov::genai::module {
+
+class Preprocessor {  // Abstract base for model-specific image preprocessors; ImagePreprocessModule holds one through a shared_ptr.
+public:
+    Preprocessor() = default;
+    virtual ~Preprocessor() = default;  // virtual so an implementation can be destroyed through a base-class pointer
+
+    virtual std::any preprocess(const ov::Tensor &images) = 0;  // result is type-erased; callers std::any_cast it to the concrete output struct (e.g. Qwen3_5PreprocessorOutput)
+};
+
+}
diff --git a/src/cpp/src/visual_language/vlm_config.cpp b/src/cpp/src/visual_language/vlm_config.cpp
@@ -23,6 +23,7 @@ VLMModelType to_vlm_model_type(const std::string& value) {
         {"qwen2_5_vl", VLMModelType::QWEN2_5_VL},
         {"gemma3", VLMModelType::GEMMA3},
         {"qwen3_5", VLMModelType::QWEN3_5},
+        {"qwen3_5_moe", VLMModelType::QWEN3_5_MOE},
     };
 
     auto it = model_types_map.find(value);
diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp
@@ -22,6 +22,7 @@ enum class VLMModelType {
     QWEN2_5_VL,
     GEMMA3,
     QWEN3_5,
+    QWEN3_5_MOE,
 };
 
 VLMModelType to_vlm_model_type(const std::string& value);
diff --git a/tests/module_genai/cpp/modules/TextEncoderModule.cpp b/tests/module_genai/cpp/modules/TextEncoderModule.cpp

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path`
`26`	`26`	`load_pos_embed_weight(model_path);`
`27`	`27`	`}`
`28`	`28`
`29`		`-Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {`
	`29`	`+std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {`
`30`	`30`	`const auto img_shape = images.get_shape();`
`31`	`31`	`if (img_shape.size() != 3 && img_shape.size() != 4) {`
`32`	`32`	`OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");`
`@@ -173,7 +173,7 @@ Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &imag`
`173`	`173`	`auto pos_embeds = build_pos_embeds(grid_thw);`
`174`	`174`	`auto rotary = build_rotary_cos_sin(grid_thw);`
`175`	`175`
`176`		`- return {pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};`
	`176`	`+ return Qwen3_5PreprocessorOutput{pixel_values, grid_thw, pos_embeds, rotary.first, rotary.second};`
`177`	`177`	`}`
`178`	`178`
`179`	`179`	`void Qwen3_5Preprocessor::load_pos_embed_weight(const std::filesystem::path &model_path) {`