Skip to content

Commit f5d8f2b

Browse files
committed
test pass.
Signed-off-by: xiping.yan <xiping.yan@intel.com>
1 parent ded6b91 commit f5d8f2b

File tree

9 files changed

+62
-84
lines changed

9 files changed

+62
-84
lines changed

src/cpp/src/module_genai/modules/md_img_preprocess/md_img_preprocess.cpp

Lines changed: 7 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -75,14 +75,6 @@ ImagePreprocessModule::ImagePreprocessModule(const IBaseModuleDesc::PTR& desc, c
7575
OPENVINO_ASSERT(_encoder_ptr != nullptr,
7676
"Failed to create VisionEncoder for ImagePreprocessModule: " + desc->name);
7777
}
78-
79-
// if (_model_type == VLMModelType::QWEN2_VL || _model_type == VLMModelType::QWEN2_5_VL) {
80-
// encoder_ptr = std::make_shared<VisionEncoderQwen2VL>(std::filesystem::path(model_path), device, ov::AnyMap{});
81-
// } else if (_model_type == VLMModelType::QWEN3_5) {
82-
// encoder_ptr = std::make_shared<Qwen3_5Preprocessor>(std::filesystem::path(model_path));
83-
// } else {
84-
// OPENVINO_THROW("ImagePreprocessModule[" + desc->name + "]: Unsupported model type: " + desc->model_type);
85-
// }
8678
}
8779

8880
ImagePreprocessModule::~ImagePreprocessModule() {}
@@ -96,8 +88,12 @@ void ImagePreprocessModule::run_image(const bool& has_image_input, const bool& h
9688
}
9789

9890
if (_vision_preprocess_ptr) {
99-
_vision_preprocess_ptr->preprocess(images_data, {});
100-
_vision_preprocess_ptr->result_to_output(this->outputs);
91+
auto output = _vision_preprocess_ptr->preprocess(images_data, {});
92+
this->outputs["pixel_values"].data = output.pixel_values;
93+
this->outputs["grid_thw"].data = output.grid_thw;
94+
this->outputs["pos_embeds"].data = output.pos_embeds;
95+
this->outputs["rotary_cos"].data = output.rotary_cos;
96+
this->outputs["rotary_sin"].data = output.rotary_sin;
10197
} else {
10298
std::vector<ov::Tensor> output_tensors;
10399
std::vector<ImageSize> output_sizes;
@@ -131,8 +127,7 @@ void ImagePreprocessModule::run_video(const bool& has_video_input, const bool& h
131127
}
132128

133129
if (_vision_preprocess_ptr) {
134-
_vision_preprocess_ptr->preprocess({}, frames);
135-
_vision_preprocess_ptr->result_to_output(this->outputs);
130+
auto output = _vision_preprocess_ptr->preprocess({}, frames);
136131
} else {
137132
auto encoded_video = _encoder_ptr->encode_frames(frames, ov::AnyMap{});
138133
this->outputs["raw_datas"].data = encoded_video.video_features;
@@ -175,49 +170,6 @@ void ImagePreprocessModule::run() {
175170
"]: No valid input found. Please provide one of the following inputs: 'image', 'images', "
176171
"'video', 'videos'.");
177172
}
178-
179-
// if (exists_input("images")) {
180-
// auto images_data = get_input("images").as<std::vector<ov::Tensor>>();
181-
182-
// std::vector<ov::Tensor> output_tensors;
183-
// std::vector<ImageSize> output_sizes;
184-
// for (size_t i = 0; i < images_data.size(); ++i) {
185-
// auto encoded_img = _encoder_ptr->encode(images_data[i], ov::AnyMap{});
186-
// output_tensors.push_back(encoded_img.resized_source);
187-
// output_sizes.push_back(encoded_img.resized_source_size);
188-
// }
189-
// this->outputs["raw_datas"].data = output_tensors;
190-
// std::vector<std::vector<int>> sizes_vec;
191-
// for (const auto& sz : output_sizes) {
192-
// sizes_vec.push_back({static_cast<int>(sz.height), static_cast<int>(sz.width)});
193-
// }
194-
// this->outputs["source_sizes"].data = sizes_vec;
195-
196-
// // } else if (model_type == VLMModelType::QWEN3_5) {
197-
// // ov::Tensor images = tensor_utils::stack(images_data, 0);
198-
// // Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(images);
199-
// // this->outputs["pixel_values"].data = output.pixel_values;
200-
// // this->outputs["grid_thw"].data = output.grid_thw;
201-
// // this->outputs["pos_embeds"].data = output.pos_embeds;
202-
// // this->outputs["rotary_cos"].data = output.rotary_cos;
203-
// // this->outputs["rotary_sin"].data = output.rotary_sin;
204-
// // }
205-
// } else {
206-
// auto image1_data = get_input("image").as<ov::Tensor>();
207-
// if (model_type == VLMModelType::QWEN2_VL || model_type == VLMModelType::QWEN2_5_VL) {
208-
// auto encoded_img = std::get<std::shared_ptr<VisionEncoderQwen2VL>>(encoder_ptr)->encode(image1_data, ov::AnyMap{});
209-
// this->outputs["raw_data"].data = encoded_img.resized_source;
210-
// this->outputs["source_size"].data =
211-
// std::vector<int>{static_cast<int>(encoded_img.resized_source_size.height), static_cast<int>(encoded_img.resized_source_size.width)};
212-
// } else if (model_type == VLMModelType::QWEN3_5) {
213-
// Qwen3_5PreprocessorOutput output = std::get<std::shared_ptr<Qwen3_5Preprocessor>>(encoder_ptr)->preprocess(image1_data);
214-
// this->outputs["pixel_values"].data = output.pixel_values;
215-
// this->outputs["grid_thw"].data = output.grid_thw;
216-
// this->outputs["pos_embeds"].data = output.pos_embeds;
217-
// this->outputs["rotary_cos"].data = output.rotary_cos;
218-
// this->outputs["rotary_sin"].data = output.rotary_sin;
219-
// }
220-
// }
221173
}
222174

223175
} // namespace module

src/cpp/src/module_genai/modules/md_img_preprocess/vision_preprocess.hpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ namespace ov::genai::module {
1717

1818
using OutputModule = IBaseModule::OutputModule;
1919

20+
// Value bundle returned by VisionPreprocess::preprocess().
//
// ImagePreprocessModule::run_image() copies each field into the module output
// that carries the same name ("pixel_values", "grid_thw", "pos_embeds",
// "rotary_cos", "rotary_sin").
// NOTE(review): shapes and element types of these tensors are decided by the
// concrete preprocessor and are not visible here — confirm against the
// implementation before relying on them.
struct PreprocessOutput {
21+
ov::Tensor pixel_values;
22+
ov::Tensor grid_thw;
23+
ov::Tensor pos_embeds;
24+
ov::Tensor rotary_cos;
25+
ov::Tensor rotary_sin;
26+
};
27+
2028
// Vision preprocessing facade.
2129
//
2230
// Current implementation encapsulates Qwen3VLVideoProcessor, but the public
@@ -33,9 +41,7 @@ class VisionPreprocess {
3341
virtual ~VisionPreprocess() = default;
3442

3543
// Preprocess images and videos.
36-
virtual void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) = 0;
37-
38-
virtual void result_to_output(std::map<std::string, OutputModule>& output) const = 0;
44+
virtual PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) = 0;
3945

4046
private:
4147
VisionPreprocess() = delete;

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ Qwen3_5Preprocessor::Qwen3_5Preprocessor(const std::filesystem::path &model_path
3131
Qwen3_5PreprocessorOutput Qwen3_5Preprocessor::preprocess(const ov::Tensor &images) {
3232
const auto img_shape = images.get_shape();
3333
if (img_shape.size() != 3 && img_shape.size() != 4) {
34-
OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C]");
34+
OPENVINO_THROW("images must have shape [H, W, C] or [B, H, W, C], get shape: ", img_shape);
3535
}
3636
if (images.get_element_type() != ov::element::u8) {
3737
OPENVINO_THROW("images must be u8 for Qwen3_5 preprocessing");

src/cpp/src/module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,10 @@
1010
#include <string>
1111
#include "openvino/runtime/tensor.hpp"
1212
#include "qwen3_5config.hpp"
13+
#include "module_genai/utils/vision_preprocess.hpp"
1314

1415
namespace ov::genai::module {
1516

16-
class IVideoProcessor;
17-
1817
struct Qwen3_5PreprocessorOutput {
1918
ov::Tensor pixel_values;
2019
ov::Tensor grid_thw;

src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.cpp

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,51 @@
66
#include <utility>
77

88
#include "openvino/core/except.hpp"
9+
#include "module_genai/utils/tensor_utils.hpp"
910

1011
namespace ov::genai::module {
1112

1213
// Builds the Qwen3.5 vision preprocessing facade.
//
// model_type is forwarded to the VisionPreprocess base class; model_path is
// used to construct the Qwen3_5Preprocessor that preprocess() delegates to.
Qwen3_5VisionPreprocess::Qwen3_5VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type)
1314
: VisionPreprocess(model_type) {
1415
// m_video_processor(std::make_unique<Qwen3_5VLVideoProcessor>(model_path)) {}
16+
m_preprocessor = std::make_shared<Qwen3_5Preprocessor>(model_path);
1517
}
1618

17-
void Qwen3_5VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
18-
OPENVINO_ASSERT(m_video_processor != nullptr);
19-
OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3_5VisionPreprocess: images and videos cannot both be non-empty");
20-
21-
if (!videos.empty()) {
22-
m_video_processor->preprocess(videos);
23-
return;
19+
// Runs Qwen3.5 image preprocessing and returns the result as a PreprocessOutput.
//
// images: one or more image tensors. More than one is combined with
//         tensor_utils::stack(images, 0); exactly one is passed through as-is.
// videos: NOTE(review): not read by any active statement below, so a call that
//         supplies only videos reaches the "No images provided" throw —
//         confirm whether video input is meant to be handled here.
// Throws (via OPENVINO_THROW) when images is empty.
PreprocessOutput Qwen3_5VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
20+
// OPENVINO_ASSERT(m_video_processor != nullptr);
21+
// OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3_5VisionPreprocess: images and videos cannot both be non-empty");
22+
23+
// if (!videos.empty()) {
24+
// m_video_processor->preprocess(videos);
25+
// return;
26+
// }
27+
// m_video_processor->preprocess(images);
28+
29+
// Collapse the input list into the single tensor that
// Qwen3_5Preprocessor::preprocess(const ov::Tensor&) takes.
// NOTE(review): stacking presumably requires all images to share one shape — confirm.
ov::Tensor stack_images;
30+
if (images.size() > 1) {
31+
stack_images = tensor_utils::stack(images, 0);
32+
} else if (images.size() == 1) {
33+
stack_images = images[0];
34+
} else {
35+
OPENVINO_THROW("No images provided for preprocessing");
2436
}
25-
m_video_processor->preprocess(images);
37+
auto output = m_preprocessor->preprocess(stack_images);
38+
39+
// Transfer each tensor from the model-specific result into the generic
// PreprocessOutput returned to the caller.
PreprocessOutput preprocess_output;
40+
preprocess_output.pixel_values = std::move(output.pixel_values);
41+
preprocess_output.grid_thw = std::move(output.grid_thw);
42+
preprocess_output.pos_embeds = std::move(output.pos_embeds);
43+
preprocess_output.rotary_cos = std::move(output.rotary_cos);
44+
preprocess_output.rotary_sin = std::move(output.rotary_sin);
45+
return preprocess_output;
2646
}
2747

28-
void Qwen3_5VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
29-
(void)output;
30-
}
48+
// void Qwen3_5VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
49+
// output["pixel_values"].data = m_output.pixel_values;
50+
// output["grid_thw"].data = m_output.grid_thw;
51+
// output["pos_embeds"].data = m_output.pos_embeds;
52+
// output["rotary_cos"].data = m_output.rotary_cos;
53+
// output["rotary_sin"].data = m_output.rotary_sin;
54+
// }
3155

3256
} // namespace ov::genai::module

src/cpp/src/module_genai/modules/model/qwen3_5/vision_preprocess.hpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
#include "module_genai/modules/md_img_preprocess/vision_preprocess.hpp"
1111
#include "module_genai/utils/vision_preprocess.hpp"
12+
#include "module_genai/modules/model/qwen3_5/qwen3_5preprocessor.hpp"
1213

1314
namespace ov::genai::module {
1415

@@ -17,12 +18,12 @@ class Qwen3_5VisionPreprocess final : public VisionPreprocess {
1718
Qwen3_5VisionPreprocess() = delete;
1819
Qwen3_5VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type);
1920

20-
void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
21+
PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
2122

22-
void result_to_output(std::map<std::string, OutputModule>& output) const override;
23+
// void result_to_output(std::map<std::string, OutputModule>& output) const override;
2324

2425
private:
25-
std::unique_ptr<IVideoProcessor> m_video_processor;
26+
std::shared_ptr<Qwen3_5Preprocessor> m_preprocessor;
2627
};
2728

2829
} // namespace ov::genai::module

src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,14 +12,10 @@ namespace ov::genai::module {
1212
Qwen3VisionPreprocess::Qwen3VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type)
1313
: VisionPreprocess(model_type) {}
1414

15-
void Qwen3VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
15+
// Placeholder for Qwen3-VL preprocessing: checks that images and videos are not
// both non-empty, then unconditionally throws "not implemented yet".
PreprocessOutput Qwen3VisionPreprocess::preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) {
1616
OPENVINO_ASSERT(images.empty() || videos.empty(), "Qwen3VisionPreprocess: images and videos cannot both be non-empty");
1717
OPENVINO_THROW("Qwen3VisionPreprocess::preprocess is not implemented yet");
18-
}
19-
20-
void Qwen3VisionPreprocess::result_to_output(std::map<std::string, OutputModule>& output) const {
21-
(void)output;
22-
OPENVINO_THROW("Qwen3VisionPreprocess::result_to_output is not implemented yet");
18+
// NOTE(review): presumably unreachable because the throw above never returns;
// looks like it is kept only to satisfy the non-void return type — confirm.
return {};
2319
}
2420

2521
} // namespace ov::genai::module

src/cpp/src/module_genai/modules/model/qwen3_vl/vision_preprocess.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@ class Qwen3VisionPreprocess final : public VisionPreprocess {
1717
Qwen3VisionPreprocess() = delete;
1818
Qwen3VisionPreprocess(const std::filesystem::path& model_path, VLMModelType model_type);
1919

20-
void preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
20+
PreprocessOutput preprocess(const std::vector<ov::Tensor>& images, const std::vector<ov::Tensor>& videos) override;
2121

22-
void result_to_output(std::map<std::string, OutputModule>& output) const override;
22+
// void result_to_output(std::map<std::string, OutputModule>& output) const override;
2323

2424
private:
2525
std::unique_ptr<IVideoProcessor> m_video_processor;

src/cpp/src/module_genai/utils/vision_preprocess.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ class IVideoProcessor {
44
public:
55
virtual ~IVideoProcessor() = default;
66

7-
virtual void sample_frames(VideoMetadata metadata, int num_frames = 0, float fps = 0.0f) = 0;
7+
// virtual void sample_frames(VideoMetadata metadata, int num_frames = 0, float fps = 0.0f) = 0;
88
virtual void preprocess(const std::vector<ov::Tensor>& frames) = 0;
99
};

0 commit comments

Comments
 (0)