Enable vision encoder module for Qwen 3.5

ZiniuLin · ZiniuLin · commit 5f6a60b5defe · 2026-03-03T10:49:51.000+08:00
Enable vision encoder module for Qwen 3.5.

Signed-off-by: Ziniu Lin <ziniu.lin@intel.com>
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess.cpp b/src/cpp/src/module_genai/modules/md_img_preprocess.cpp
@@ -106,7 +106,7 @@ void ImagePreprocessModule::run() {
             this->outputs["source_sizes"].data = sizes_vec;
         } else if (model_type == VLMModelType::QWEN3_5) {
             ov::Tensor images = tensor_utils::stack(images_data, 0);
-            auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(images));
+            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
             this->outputs["pixel_values"].data = output.pixel_values;
             this->outputs["grid_thw"].data = output.grid_thw;
             this->outputs["pos_embeds"].data = output.pos_embeds;
@@ -121,7 +121,7 @@ void ImagePreprocessModule::run() {
             this->outputs["source_size"].data =
                 std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
         } else if (model_type == VLMModelType::QWEN3_5) {
-            auto output = std::any_cast<Qwen3_5PreprocessorOutput>(std::get<std::shared_ptr<Preprocessor>>(encoder_ptr)->preprocess(image1_data));
+            Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
             this->outputs["pixel_values"].data = output.pixel_values;
             this->outputs["grid_thw"].data = output.grid_thw;
             this->outputs["pos_embeds"].data = output.pos_embeds;
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess.hpp b/src/cpp/src/module_genai/modules/md_img_preprocess.hpp
@@ -8,7 +8,7 @@
 
 #include "module_genai/module.hpp"
 #include "module_genai/module_type.hpp"
-#include "preprocessor.hpp"
+#include "model/qwen3_5/qwen3_5preprocessor.hpp"
 #include "visual_language/qwen2vl/classes.hpp"
 
 namespace ov {
@@ -18,7 +18,7 @@ class ImagePreprocessModule : public IBaseModule {
     DeclareModuleConstructor(ImagePreprocessModule);
 
 private:
-    std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Preprocessor>> encoder_ptr;
+    std::variant<std::shared_ptr<VisionEncoderQwen2VL>, std::shared_ptr<Qwen3_5Preprocessor>> encoder_ptr;
 };
 
 REGISTER_MODULE_CONFIG(ImagePreprocessModule);
diff --git a/src/cpp/src/module_genai/modules/md_text_encoder.cpp b/src/cpp/src/module_genai/modules/md_text_encoder.cpp
@@ -91,6 +91,7 @@ bool TextEncoderModule::initialize() {
         m_merge_length = std::pow(vision_config.spatial_merge_size, 2);
     } else {
         GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);
+        return false;
     }
     
     return true;
diff --git a/src/cpp/src/module_genai/modules/md_vision_encoder.cpp b/src/cpp/src/module_genai/modules/md_vision_encoder.cpp
diff --git a/src/cpp/src/module_genai/modules/md_vision_encoder.hpp b/src/cpp/src/module_genai/modules/md_vision_encoder.hpp
@@ -8,6 +8,7 @@
 #include "visual_language/processor_config.hpp"
 #include "visual_language/vision_encoder.hpp"
 #include "visual_language/vlm_config.hpp"
+#include "model/qwen3_5/qwen3_5config.hpp"
 
 
 namespace ov {
@@ -20,6 +21,14 @@ class VisionEncoderModule : public IBaseModule {
 private:
     bool initialize();
     std::pair<ov::Tensor, ov::Tensor> embed(const EncodedImage &image, const std::vector<int>& images_sequence, const ov::Tensor& input_ids);
+    Qwen3_5VisionEmbeddingResult embed(
+        const ov::Tensor &pixel_values,
+        const ov::Tensor &grid_thw,
+        const ov::Tensor &pos_embeds,
+        const ov::Tensor &rotary_cos,
+        const ov::Tensor &rotary_sin,
+        const ov::Tensor &input_ids,
+        const ov::Tensor &attention_mask);
     ov::Tensor get_rotary_pos_emb(const std::vector<std::array<size_t, 3>>& grids_thw);
     size_t calc_vec_tokens_num(const std::vector<std::array<size_t, 3UL>>& vec_grid_thw) const;
     size_t calc_tokens_num(size_t grid_t, size_t grid_h, size_t grid_w) const;
@@ -33,7 +42,7 @@ class VisionEncoderModule : public IBaseModule {
                                    const int64_t vision_start_token_id,
                                    const std::vector<std::pair<std::size_t, std::size_t>>& history_vision_count);
 
-    std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_ireq_queue_vision_embeddings_merger;
+    std::unique_ptr<CircularBufferQueue<ov::InferRequest>> m_request_queue;
     bool m_with_cu_seqlens_input { false };
     VLMConfig m_vlm_config;
     ProcessorConfig m_processor_config;
@@ -42,6 +51,8 @@ class VisionEncoderModule : public IBaseModule {
     ov::Tensor m_position_ids;
     int64_t m_rope_delta = 0;
     int64_t m_vision_start_token_id = 0;
+    int64_t m_image_pad_token_id = 0;
+    int64_t m_video_pad_token_id = 0;
 };
 
 REGISTER_MODULE_CONFIG(VisionEncoderModule) ;
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5config.hpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5config.hpp
@@ -7,6 +7,7 @@
 #include <vector>
 #include <filesystem>
 #include <array>
+#include "openvino/runtime/tensor.hpp"
 
 namespace ov::genai::module {
 
@@ -43,4 +44,11 @@ struct Qwen3_5VisionPreprocessConfig {
     static Qwen3_5VisionPreprocessConfig from_json_file(const std::filesystem::path& path);
 };
 
+struct Qwen3_5VisionEmbeddingResult {  // Four tensors returned together by the Qwen 3.5 overload of VisionEncoderModule::embed (declared in md_vision_encoder.hpp).
+    ov::Tensor position_ids;     // presumably surfaced as the module's "position_ids" output — confirm in md_vision_encoder.cpp
+    ov::Tensor visual_pos_mask;  // presumably surfaced as the module's "visual_pos_mask" output — confirm
+    ov::Tensor rope_deltas;      // NOTE(review): plural here, while the module output in the test is named "rope_delta" — confirm the mapping
+    ov::Tensor visual_embeds;    // presumably surfaced as the module's "image_embedding" output — confirm
+};
+
 }
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp
@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
     load_pos_embed_weight(model_path);
 }
 
-std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
+Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
     const auto img_shape = images.get_shape();
     if (img_shape.size() != 3 && img_shape.size() != 4) {
         OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp
@@ -9,7 +9,6 @@
 #include <string>
 #include "openvino/runtime/tensor.hpp"
 #include "qwen3_5config.hpp"
-#include "../../preprocessor.hpp"
 
 namespace ov::genai::module {
 
@@ -21,11 +20,11 @@ struct Qwen3_5PreprocessorOutput {
     ov::Tensor rotary_sin;
 };
 
-class Qwen3_5Preprocessor : public Preprocessor {
+class Qwen3_5Preprocessor {
 public:
     explicit Qwen3_5Preprocessor(const std::filesystem::path& model_path);
 
-    std::any preprocess(const ov::Tensor &images) override;
+    Qwen3_5PreprocessorOutput preprocess(const ov::Tensor &images);
 private:
     Qwen3_5VisionPreprocessConfig m_preprocess_config;
     Qwen3_5VisionConfig m_vision_config;
diff --git a/src/cpp/src/module_genai/modules/preprocessor.hpp b/src/cpp/src/module_genai/modules/preprocessor.hpp
diff --git a/tests/module_genai/cpp/modules/TextEncoderModule.cpp b/tests/module_genai/cpp/modules/TextEncoderModule.cpp
@@ -184,10 +184,10 @@ class Qwen3_5TextEncoderModuleTest : public ModuleTestBase, public ::testing::Te
                 input_node("image", to_string(DataType::OVTensor), pipeline_params_name + ".image"));
             cur_node["outputs"] = YAML::Node(YAML::NodeType::Sequence);
             cur_node["outputs"].push_back(output_node("pixel_values", to_string(DataType::OVTensor)));
-            cur_node["outputs"].push_back(output_node("grid_thw", to_string(DataType::VecInt)));
-            cur_node["outputs"].push_back(output_node("pos_embeds", to_string(DataType::VecInt)));
-            cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::VecInt)));
-            cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::VecInt)));
+            cur_node["outputs"].push_back(output_node("grid_thw", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("pos_embeds", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("rotary_cos", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("rotary_sin", to_string(DataType::OVTensor)));
             cur_node["params"] = YAML::Node();
             cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5();
             pipeline_modules[image_preprocessor_name] = cur_node;
diff --git a/tests/module_genai/cpp/modules/VisionEncoderModule.cpp b/tests/module_genai/cpp/modules/VisionEncoderModule.cpp
@@ -162,4 +162,202 @@ INSTANTIATE_TEST_SUITE_P(ModuleTestSuite,
                          ::testing::Combine(::testing::ValuesIn(vision_encoder_test::test_data),
                                             ::testing::ValuesIn(vision_encoder_test::test_devices)),
                          VisionEncoderModuleTest::get_test_case_name);
-   
+
+struct Qwen3_5VisionEncoderTestData {  // One parameter set for Qwen3_5VisionEncoderModuleTest: the seven input tensors handed to the module.
+    ov::Tensor pixel_values;    // fed to the module under the input name "preprocessed_image"
+    ov::Tensor grid_thw;        // fed as "grid_thw"
+    ov::Tensor pos_embeds;      // fed as "pos_embeds"
+    ov::Tensor rotary_cos;      // fed as "rotary_cos"
+    ov::Tensor rotary_sin;      // fed as "rotary_sin"
+    ov::Tensor input_ids;       // fed as "input_ids"
+    ov::Tensor attention_mask;  // fed as "attention_mask"
+};
+
+namespace TEST_DATA {  // Fixture builders for the Qwen 3.5 vision encoder test.
+
+Qwen3_5VisionEncoderTestData qwen3_5_vision_encoder_test_data() {  // Builds the single input set used by the test: four random tensors plus hand-written grid, token ids and mask.
+    Qwen3_5VisionEncoderTestData data;
+    const size_t seed = 42;  // the same seed is passed to all four ut_randn_tensor calls below
+
+    data.pixel_values = ov::genai::module::ModuleTestBase::ut_randn_tensor(ov::Shape{256, 3, 2, 16, 16}, seed);
+    data.pos_embeds   = ov::genai::module::ModuleTestBase::ut_randn_tensor(ov::Shape{256, 1152}, seed);
+    data.rotary_cos   = ov::genai::module::ModuleTestBase::ut_randn_tensor(ov::Shape{256, 72},   seed);
+    data.rotary_sin   = ov::genai::module::ModuleTestBase::ut_randn_tensor(ov::Shape{256, 72},   seed);  // NOTE(review): same shape and seed as rotary_cos; if ut_randn_tensor is deterministic per (shape, seed) the two are identical — confirm this is intended
+
+    data.grid_thw = ov::Tensor(ov::element::i64, ov::Shape{1, 3});  // a single row of three int64 values
+    {
+        int64_t* p = data.grid_thw.data<int64_t>();
+        p[0] = 1; p[1] = 16; p[2] = 16;  // 1 * 16 * 16 = 256, the leading dimension of the four tensors above; presumably (t, h, w) — confirm
+    }
+
+    data.input_ids = ov::Tensor(ov::element::i64, ov::Shape{1, 78});  // 78 ids: 3 + 1 + 64 + 1 + 9, filled in below
+    {
+        int64_t* p = data.input_ids.data<int64_t>();
+        p[0] = 248045; p[1] = 846; p[2] = 198;  // three leading ids
+        p[3] = 248053;  // equals the vision_start_token_id the test passes in the module params
+        for (int i = 4; i < 68; ++i) p[i] = 248056;  // 64 identical ids at indices 4..67; presumably the image pad token — confirm
+        p[68] = 248054;  // presumably the vision end token — confirm
+        int64_t text_tokens[] = {5606, 420, 6866, 198, 14524, 2534, 553, 445, 13};
+        for (int i = 0; i < 9; ++i) p[69 + i] = text_tokens[i];  // nine trailing ids fill indices 69..77
+    }
+
+    data.attention_mask = ov::Tensor(ov::element::i64, ov::Shape{1, 78});  // same shape as input_ids
+    {
+        int64_t* p = data.attention_mask.data<int64_t>();
+        std::fill(p, p + 78, int64_t(1));  // every position is set to 1
+    }
+
+    return data;
+}
+
+}
+
+using qwen3_5_test_params = std::tuple<Qwen3_5VisionEncoderTestData, std::string>;
+
+class Qwen3_5VisionEncoderModuleTest  // Parameterized test: feeds prebuilt Qwen 3.5 vision inputs to a single VisionEncoderModule node and checks its four outputs.
+    : public ov::genai::module::ModuleTestBase,
+      public ::testing::TestWithParam<qwen3_5_test_params> {
+private:
+    std::string m_device;                      // device string from the test parameter tuple
+    Qwen3_5VisionEncoderTestData m_test_data;  // input tensors from the test parameter tuple
+
+public:
+    static std::string get_test_case_name(const testing::TestParamInfo<qwen3_5_test_params>& obj) {  // Test name is "Qwen3_5_" followed by the device string (tuple element 1).
+        return "Qwen3_5_" + std::get<1>(obj.param);
+    }
+
+    void SetUp() override {
+        REGISTER_TEST_NAME();
+        std::tie(m_test_data, m_device) = GetParam();  // unpack (test data, device) for this instance
+    }
+
+    void TearDown() override {}  // nothing to release
+
+protected:
+    std::string get_yaml_content() override {  // Builds the pipeline YAML: model_type "qwen3_5" and one VisionEncoderModule node with seven inputs and four outputs.
+        YAML::Node config;
+        config["global_context"]["model_type"] = "qwen3_5";
+        YAML::Node pipeline_modules = config["pipeline_modules"];
+
+        std::string vision_encoder_name = "vision_encoder";
+        {
+            YAML::Node cur_node;
+            cur_node["type"] = "VisionEncoderModule";
+            cur_node["device"] = m_device;
+            cur_node["inputs"] = YAML::Node(YAML::NodeType::Sequence);  // each input below is a {name, type} map with type "OVTensor"
+            YAML::Node preprocessed_image;
+            preprocessed_image["name"] = "preprocessed_image";
+            preprocessed_image["type"] = "OVTensor";
+            cur_node["inputs"].push_back(preprocessed_image);
+            YAML::Node grid_thw;
+            grid_thw["name"] = "grid_thw";
+            grid_thw["type"] = "OVTensor";
+            cur_node["inputs"].push_back(grid_thw);
+            YAML::Node pos_embeds;
+            pos_embeds["name"] = "pos_embeds";
+            pos_embeds["type"] = "OVTensor";
+            cur_node["inputs"].push_back(pos_embeds);
+            YAML::Node rotary_cos;
+            rotary_cos["name"] = "rotary_cos";
+            rotary_cos["type"] = "OVTensor";
+            cur_node["inputs"].push_back(rotary_cos);
+            YAML::Node rotary_sin;
+            rotary_sin["name"] = "rotary_sin";
+            rotary_sin["type"] = "OVTensor";
+            cur_node["inputs"].push_back(rotary_sin);
+            YAML::Node input_ids;
+            input_ids["name"] = "input_ids";
+            input_ids["type"] = "OVTensor";
+            cur_node["inputs"].push_back(input_ids);
+            YAML::Node attention_mask;
+            attention_mask["name"] = "attention_mask";
+            attention_mask["type"] = "OVTensor";
+            cur_node["inputs"].push_back(attention_mask);
+            cur_node["outputs"] = YAML::Node(YAML::NodeType::Sequence);  // four outputs, all declared with DataType::OVTensor
+            cur_node["outputs"].push_back(output_node("image_embedding", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("visual_pos_mask", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("position_ids", to_string(DataType::OVTensor)));
+            cur_node["outputs"].push_back(output_node("rope_delta", to_string(DataType::OVTensor)));
+            cur_node["params"] = YAML::Node();
+            cur_node["params"]["model_path"] = TEST_MODEL::Qwen3_5() + "qwen3_5_vision_q4a_b4a_g128.xml";  // NOTE(review): plain concatenation; relies on TEST_MODEL::Qwen3_5() ending with a path separator — confirm
+            cur_node["params"]["vision_start_token_id"] = 248053;  // same id the test data stores at input_ids[3]
+            pipeline_modules[vision_encoder_name] = cur_node;
+        }
+        return YAML::Dump(config);
+    }
+
+    ov::AnyMap prepare_inputs() override {  // Keys mirror the input names declared in get_yaml_content().
+        ov::AnyMap inputs;
+        inputs["preprocessed_image"]   = m_test_data.pixel_values;  // pixel_values travels under the name "preprocessed_image"
+        inputs["grid_thw"]       = m_test_data.grid_thw;
+        inputs["pos_embeds"]     = m_test_data.pos_embeds;
+        inputs["rotary_cos"]     = m_test_data.rotary_cos;
+        inputs["rotary_sin"]     = m_test_data.rotary_sin;
+        inputs["input_ids"]      = m_test_data.input_ids;
+        inputs["attention_mask"] = m_test_data.attention_mask;
+        return inputs;
+    }
+
+    std::vector<float> expected_image_embedding = {  // NOTE(review): only 20 values are listed while the expected shape holds 1*78*2048 elements; presumably compare_big_tensor checks just a leading prefix — confirm
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };  // every listed expected value is 0
+    ov::Shape expected_image_embedding_shape = ov::Shape{1, 78, 2048};
+
+    std::vector<bool> expected_visual_pos_mask = {  // 20 listed values against an expected shape of 78 elements
+        0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+    };  // 0 for the first four entries, 1 afterwards — lines up with input_ids, whose run of 248056 starts at index 4
+    ov::Shape expected_visual_pos_mask_shape = ov::Shape{1, 78};
+
+    std::vector<int64_t> expected_position_ids = {  // 20 listed values against an expected shape of 3*1*78 elements
+        0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+    };  // counts 0..3, then stays at 4 for the remaining listed entries
+    ov::Shape expected_position_ids_shape = ov::Shape{3, 1, 78};
+
+    std::vector<int64_t> expected_rope_delta = {  // a single expected value, matching the {1, 1} shape below
+        -56
+    };
+    ov::Shape expected_rope_delta_shape = ov::Shape{1, 1};
+
+    void check_outputs(ov::genai::module::ModulePipeline& pipe) override {  // Fetches the four module outputs as ov::Tensor and checks values and shape of each.
+        auto image_embedding   = pipe.get_output("image_embedding").as<ov::Tensor>();
+        auto visual_pos_mask = pipe.get_output("visual_pos_mask").as<ov::Tensor>();
+        auto position_ids    = pipe.get_output("position_ids").as<ov::Tensor>();
+        auto rope_delta     = pipe.get_output("rope_delta").as<ov::Tensor>();
+
+        EXPECT_TRUE(compare_big_tensor(image_embedding, expected_image_embedding))  // EXPECT_* is non-fatal, so the remaining checks still run after a mismatch
+            << "image_embedding do not match expected values";
+        EXPECT_TRUE(compare_shape(image_embedding.get_shape(), expected_image_embedding_shape))
+            << "image_embedding shape mismatch: got " << image_embedding.get_shape();
+        
+        EXPECT_TRUE(compare_big_tensor(visual_pos_mask, expected_visual_pos_mask))
+            << "visual_pos_mask do not match expected values";
+        EXPECT_TRUE(compare_shape(visual_pos_mask.get_shape(), expected_visual_pos_mask_shape))
+            << "visual_pos_mask shape mismatch: got " << visual_pos_mask.get_shape();
+        
+        EXPECT_TRUE(compare_big_tensor(position_ids, expected_position_ids))
+            << "position_ids do not match expected values";
+        EXPECT_TRUE(compare_shape(position_ids.get_shape(), expected_position_ids_shape))
+            << "position_ids shape mismatch: got " << position_ids.get_shape();
+
+        EXPECT_TRUE(compare_big_tensor(rope_delta, expected_rope_delta))
+            << "rope_delta do not match expected values";
+        EXPECT_TRUE(compare_shape(rope_delta.get_shape(), expected_rope_delta_shape))
+            << "rope_delta shape mismatch: got " << rope_delta.get_shape();
+    }
+};
+
+TEST_P(Qwen3_5VisionEncoderModuleTest, ModuleTest) {  // Single test body, executed once per instantiated (test data, device) pair.
+    run();  // not defined in the fixture itself; presumably inherited from ModuleTestBase — confirm
+}
+
+namespace qwen3_5_vision_encoder_test {  // Parameter lists consumed by the INSTANTIATE_TEST_SUITE_P call for Qwen3_5VisionEncoderModuleTest.
+    auto test_data    = std::vector<Qwen3_5VisionEncoderTestData>{TEST_DATA::qwen3_5_vision_encoder_test_data()};  // exactly one input set
+    auto test_devices = std::vector<std::string>{TEST_MODEL::get_device()};  // exactly one device, taken from TEST_MODEL::get_device()
+}
+
+INSTANTIATE_TEST_SUITE_P(ModuleTestSuite,  // Instantiates the Qwen 3.5 vision encoder test for every (test data, device) combination.
+                         Qwen3_5VisionEncoderModuleTest,
+                         ::testing::Combine(
+                             ::testing::ValuesIn(qwen3_5_vision_encoder_test::test_data),
+                             ::testing::ValuesIn(qwen3_5_vision_encoder_test::test_devices)),  // one data set x one device = one instance
+                         Qwen3_5VisionEncoderModuleTest::get_test_case_name);  // names each instance "Qwen3_5_<device>"

Original file line number	Diff line number	Diff line change
`@@ -91,6 +91,7 @@ bool TextEncoderModule::initialize() {`
`91`	`91`	`m_merge_length = std::pow(vision_config.spatial_merge_size, 2);`
`92`	`92`	`} else {`
`93`	`93`	`GENAI_ERR("TextEncoderModule[" + module_desc->name + "]: Unsupported model type: " + module_desc->model_type);`
	`94`	`+ return false;`
`94`	`95`	`}`
`95`	`96`
`96`	`97`	`return true;`
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path`
`26`	`26`	`load_pos_embed_weight(model_path);`
`27`	`27`	`}`
`28`	`28`
`29`		`-std::any Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {`
	`29`	`+Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {`
`30`	`30`	`const auto img_shape = images.get_shape();`
`31`	`31`	`if (img_shape.size() != 3 && img_shape.size() != 4) {`
`32`	`32`	`OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");`