test pass.

xipingyan · xipingyan · commit f5d8f2bd787d · 2026-03-03T21:12:06.000+08:00
Signed-off-by: xiping.yan <xiping.yan@intel.com>
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess/md_img_preprocess.cpp b/src/cpp/src/module_genai/modules/md_img_preprocess/md_img_preprocess.cpp
@@ -75,14 +75,6 @@ ImagePreprocessModule::ImagePreprocessModule(const IBaseModuleDesc::PTR& desc, c
         OPENVINO_ASSERT(_encoder_ptr != nullptr,
                         "Failed to create VisionEncoder for ImagePreprocessModule: " + desc->name);
     }
-
-    // if (_model_type == VLMModelType::QWEN2_VL || _model_type == VLMModelType::QWEN2_5_VL) {
-    //     encoder_ptr = std::make_shared<VisionEncoderQwen2VL>(std::filesystem::path(model_path), device, ov::AnyMap{});
-    // } else if (_model_type == VLMModelType::QWEN3_5) {
-    //     encoder_ptr = std::make_shared<Qwen3_5Preprocessor>(std::filesystem::path(model_path));
-    // } else {
-    //     OPENVINO_THROW("ImagePreprocessModule[" + desc->name + "]: Unsupported model type: " + desc->model_type);
-    // }
 }
 
 ImagePreprocessModule::~ImagePreprocessModule() {}
@@ -96,8 +88,12 @@ void ImagePreprocessModule::run_image(const bool& has_image_input, const bool& h
     }
 
     if (_vision_preprocess_ptr) {
-        _vision_preprocess_ptr->preprocess(images_data, {});
-        _vision_preprocess_ptr->result_to_output(this->outputs);
+        auto output = _vision_preprocess_ptr->preprocess(images_data, {});
+        this->outputs["pixel_values"].data = output.pixel_values;
+        this->outputs["grid_thw"].data = output.grid_thw;
+        this->outputs["pos_embeds"].data = output.pos_embeds;
+        this->outputs["rotary_cos"].data = output.rotary_cos;
+        this->outputs["rotary_sin"].data = output.rotary_sin;
     } else {
         std::vector<ov::Tensor> output_tensors;
         std::vector<ImageSize> output_sizes;
@@ -131,8 +127,7 @@ void ImagePreprocessModule::run_video(const bool& has_video_input, const bool& h
     }
 
     if (_vision_preprocess_ptr) {
-        _vision_preprocess_ptr->preprocess({}, frames);
-        _vision_preprocess_ptr->result_to_output(this->outputs);
+        auto output = _vision_preprocess_ptr->preprocess({}, frames);
     } else {
         auto encoded_video = _encoder_ptr->encode_frames(frames, ov::AnyMap{});
         this->outputs["raw_datas"].data = encoded_video.video_features;
@@ -175,49 +170,6 @@ void ImagePreprocessModule::run() {
                        "]: No valid input found. Please provide one of the following inputs: 'image', 'images', "
                        "'video', 'videos'.");
     }
-
-    // if (exists_input("images")) {
-    //     auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
-
-    //     std::vector<ov::Tensor> output_tensors;
-    //     std::vector<ImageSize> output_sizes;
-    //     for (size_t i = 0; i < images_data.size(); ++i) {
-    //         auto encoded_img = _encoder_ptr->encode(images_data[i], ov::AnyMap{});
-    //         output_tensors.push_back(encoded_img.resized_source);
-    //         output_sizes.push_back(encoded_img.resized_source_size);
-    //     }
-    //     this->outputs["raw_datas"].data = output_tensors;
-    //     std::vector<std::vector<int>> sizes_vec;
-    //     for (const auto& sz : output_sizes) {
-    //         sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});
-    //     }
-    //     this->outputs["source_sizes"].data = sizes_vec;
-
-    //     // } else if (model_type == VLMModelType::QWEN3_5) {
-    //     //     ov::Tensor images = tensor_utils::stack(images_data, 0);
-    //     //     Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
-    //     //     this->outputs["pixel_values"].data = output.pixel_values;
-    //     //     this->outputs["grid_thw"].data = output.grid_thw;
-    //     //     this->outputs["pos_embeds"].data = output.pos_embeds;
-    //     //     this->outputs["rotary_cos"].data = output.rotary_cos;
-    //     //     this->outputs["rotary_sin"].data = output.rotary_sin;
-    //     // }
-    // } else {
-    //     auto image1_data = get_input("image").as<ov::Tensor>();
-    //     if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
-    //         auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});
-    //         this->outputs["raw_data"].data = encoded_img.resized_source;
-    //         this->outputs["source_size"].data =
-    //             std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
-    //     } else if (model_type == VLMModelType::QWEN3_5) {
-    //         Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
-    //         this->outputs["pixel_values"].data = output.pixel_values;
-    //         this->outputs["grid_thw"].data = output.grid_thw;
-    //         this->outputs["pos_embeds"].data = output.pos_embeds;
-    //         this->outputs["rotary_cos"].data = output.rotary_cos;
-    //         this->outputs["rotary_sin"].data = output.rotary_sin;
-    //     }
-    // }
 }
 
 }  // namespace module
diff --git a/src/cpp/src/module_genai/modules/md_img_preprocess/vision_preprocess.hpp b/src/cpp/src/module_genai/modules/md_img_preprocess/vision_preprocess.hpp
@@ -17,6 +17,14 @@ namespace ov::genai::module {
 
 using OutputModule = IBaseModule::OutputModule;
 
+struct PreprocessOutput {
+    ov::Tensor pixel_values;
+    ov::Tensor grid_thw;
+    ov::Tensor pos_embeds;
+    ov::Tensor rotary_cos;
+    ov::Tensor rotary_sin;
+};
+
 // Vision preprocessing facade.
 //
 // Current implementation encapsulates Qwen3VLVideoProcessor, but the public
@@ -33,9 +41,7 @@ class VisionPreprocess {
     virtual ~VisionPreprocess() = default;
 
     // Preprocess images and videos.
-    virtual void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) = 0;
-
-    virtual void result_to_output(std::map<std::string, OutputModule>& output) const = 0;
+    virtual PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) = 0;
 
 private:
     VisionPreprocess() = delete;
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp
@@ -31,7 +31,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
 Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
     const auto img_shape = images.get_shape();
     if (img_shape.size() != 3 && img_shape.size() != 4) {
-        OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");
+        OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C], get shape: ", img_shape);
     }
     if (images.get_element_type() != ov::element::u8) {
         OPENVINO_THROW("images must be u8 for Qwen3_5 preprocessing");
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp b/src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp
@@ -10,11 +10,10 @@
 #include <string>
 #include "openvino/runtime/tensor.hpp"
 #include "qwen3_5config.hpp"
+#include "module_genai/utils/vision_preprocess.hpp"
 
 namespace ov::genai::module {
 
-class IVideoProcessor;
-
 struct Qwen3_5PreprocessorOutput {
     ov::Tensor pixel_values;
     ov::Tensor grid_thw;
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.cpp b/src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.cpp
@@ -6,27 +6,51 @@
 #include <utility>
 
 #include "openvino/core/except.hpp"
+#include "module_genai/utils/tensor_utils.hpp"
 
 namespace ov::genai::module {
 
 Qwen3_5VisionPreprocess::Qwen3_5VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type)
     : VisionPreprocess(model_type) {
     //   m_video_processor(std::make_unique<Qwen3_5VLVideoProcessor>(model_path)) {}
+    m_preprocessor = std::make_shared<Qwen3_5Preprocessor>(model_path);
 }
 
-void Qwen3_5VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
-    OPENVINO_ASSERT(m_video_processor != nullptr);
-    OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3_5VisionPreprocess: images and videos cannot both be non-empty");
-
-    if (!videos.empty()) {
-        m_video_processor->preprocess(videos);
-        return;
+PreprocessOutput Qwen3_5VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
+    // OPENVINO_ASSERT(m_video_processor != nullptr);
+    // OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3_5VisionPreprocess: images and videos cannot both be non-empty");
+
+    // if (!videos.empty()) {
+    //     m_video_processor->preprocess(videos);
+    //     return;
+    // }
+    // m_video_processor->preprocess(images);
+
+    ov::Tensor stack_images;
+    if (images.size() > 1) {
+        stack_images = tensor_utils::stack(images, 0);
+    } else if (images.size() == 1) {
+        stack_images = images[0];
+    } else {
+        OPENVINO_THROW("No images provided for preprocessing");
     }
-    m_video_processor->preprocess(images);
+    auto output = m_preprocessor->preprocess(stack_images);
+
+    PreprocessOutput preprocess_output;
+    preprocess_output.pixel_values = std::move(output.pixel_values);
+    preprocess_output.grid_thw = std::move(output.grid_thw);
+    preprocess_output.pos_embeds = std::move(output.pos_embeds);
+    preprocess_output.rotary_cos = std::move(output.rotary_cos);
+    preprocess_output.rotary_sin = std::move(output.rotary_sin);
+    return preprocess_output;
 }
 
-void Qwen3_5VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
-    (void)output;
-}
+// void Qwen3_5VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
+//     output["pixel_values"].data = m_output.pixel_values;
+//     output["grid_thw"].data = m_output.grid_thw;
+//     output["pos_embeds"].data = m_output.pos_embeds;
+//     output["rotary_cos"].data = m_output.rotary_cos;
+//     output["rotary_sin"].data = m_output.rotary_sin;
+// }
 
 }  // namespace ov::genai::module
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.hpp b/src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.hpp
@@ -9,6 +9,7 @@
 
 #include "module_genai/modules/md_img_preprocess/vision_preprocess.hpp"
 #include "module_genai/utils/vision_preprocess.hpp"
+#include "module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp"
 
 namespace ov::genai::module {
 
@@ -17,12 +18,12 @@ class Qwen3_5VisionPreprocess final : public VisionPreprocess {
     Qwen3_5VisionPreprocess() = delete;
     Qwen3_5VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type);
 
-    void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
+    PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
 
-    void result_to_output(std::map<std::string, OutputModule>& output) const override;
+    // void result_to_output(std::map<std::string, OutputModule>& output) const override;
 
 private:
-    std::unique_ptr<IVideoProcessor> m_video_processor;
+    std::shared_ptr<Qwen3_5Preprocessor> m_preprocessor;
 };
 
 }  // namespace ov::genai::module
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.cpp b/src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.cpp
@@ -12,14 +12,10 @@ namespace ov::genai::module {
 Qwen3VisionPreprocess::Qwen3VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type)
     : VisionPreprocess(model_type) {}
 
-void Qwen3VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
+PreprocessOutput Qwen3VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
     OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3VisionPreprocess: images and videos cannot both be non-empty");
     OPENVINO_THROW("Qwen3VisionPreprocess::preprocess is not implemented yet");
-}
-
-void Qwen3VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
-    (void)output;
-    OPENVINO_THROW("Qwen3VisionPreprocess::result_to_output is not implemented yet");
+    return {};
 }
 
 }  // namespace ov::genai::module
diff --git a/src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.hpp b/src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.hpp
@@ -17,9 +17,9 @@ class Qwen3VisionPreprocess final : public VisionPreprocess {
     Qwen3VisionPreprocess() = delete;
     Qwen3VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type);
 
-    void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
+    PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
 
-    void result_to_output(std::map<std::string, OutputModule>& output) const override;
+    // void result_to_output(std::map<std::string, OutputModule>& output) const override;
 
 private:
     std::unique_ptr<IVideoProcessor> m_video_processor;
diff --git a/src/cpp/src/module_genai/utils/vision_preprocess.hpp b/src/cpp/src/module_genai/utils/vision_preprocess.hpp
@@ -4,6 +4,6 @@ class IVideoProcessor {
 public:
     virtual ~IVideoProcessor() = default;
 
-    virtual void sample_frames(VideoMetadata metadata, int num_frames = 0, float fps = 0.0f) = 0;
+    // virtual void sample_frames(VideoMetadata metadata, int num_frames = 0, float fps = 0.0f) = 0;
     virtual void preprocess(const std::vector<ov::Tensor>& frames) = 0;
 };

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path`
`31`	`31`	`Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {`
`32`	`32`	`const auto img_shape = images.get_shape();`
`33`	`33`	`if (img_shape.size() != 3 && img_shape.size() != 4) {`
`34`		`- OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");`
	`34`	`+ OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C], get shape: ", img_shape);`
`35`	`35`	`}`
`36`	`36`	`if (images.get_element_type() != ov::element::u8) {`
`37`	`37`	`OPENVINO_THROW("images must be u8 for Qwen3_5 preprocessing");`