From 6c49dc88eb89531546bc58620a56a15154234d5d Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 30 Jul 2025 12:02:48 +0800 Subject: [PATCH 001/118] Avoid resizing images whose width and height already match the target. Signed-off-by: xipingya --- src/cpp/src/visual_language/clip.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/clip.cpp b/src/cpp/src/visual_language/clip.cpp index 2ae614d938..d4ecdae85d 100644 --- a/src/cpp/src/visual_language/clip.cpp +++ b/src/cpp/src/visual_language/clip.cpp @@ -76,7 +76,12 @@ void bicubic_resize(const clip_image_u8 &img, clip_image_u8 &dst, int target_wid dst.nx = target_width; dst.ny = target_height; - dst.buf.resize(3 * target_width * target_height); + const int target_size = 3 * target_width * target_height; + dst.buf.resize(target_size); + if (img.nx == target_width && img.ny == target_height) { + std::memcpy(dst.buf.data(), img.buf.data(), target_size); + return; + } float Cc; float C[5]; From c7d9932df7f4fc8eb24f00ebc2832bc9145a932e Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 30 Jul 2025 21:08:40 +0800 Subject: [PATCH 002/118] Enable video processing for qwen*-vl Signed-off-by: xipingya --- .../openvino/genai/generation_config.hpp | 2 + .../src/continuous_batching/pipeline_base.cpp | 6 +- .../src/visual_language/inputs_embedder.cpp | 12 ++- .../src/visual_language/inputs_embedder.hpp | 4 +- src/cpp/src/visual_language/llava/classes.cpp | 5 +- src/cpp/src/visual_language/llava/classes.hpp | 2 +- .../visual_language/llava_next/classes.cpp | 6 +- .../visual_language/llava_next/classes.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 2 +- .../src/visual_language/qwen2vl/classes.cpp | 86 +++++++++++++++++-- .../src/visual_language/qwen2vl/classes.hpp | 4 + .../src/visual_language/vision_encoder.hpp | 4 + 12 files changed, 113 insertions(+), 22 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 67b6ff2a87..fab0ff39b4 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -212,6 +212,8 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // set to true if chat template should be applied for non-chat scenarios, set to false otherwise bool apply_chat_template = true; + // Video or image + std::string type = "image"; /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
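Note: a minimal caller-side sketch of how the `type` switch introduced above is meant to be driven (illustrative only; `pipe`, `prompt` and `frames` are hypothetical, and this string field is replaced by `is_video` and finally by a dedicated `video` property later in this series):

    ov::genai::GenerationConfig config;
    config.type = "video";  // "image" is the default; "video" routes the frames to encode_video()
    auto result = pipe.generate(prompt, frames, config, {});  // frames: std::vector<ov::Tensor> of RGB frames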
diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index c8de7ab210..0ccc9951a0 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -167,7 +167,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto& rgbs = rgbs_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - encoded_images = m_inputs_embedder->encode_images(rgbs); + encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].type == "video"); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -185,7 +185,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; const auto& rgbs = rgbs_vector[i]; - const auto encoded_images = m_inputs_embedder->encode_images(rgbs); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].type == "video"); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -248,7 +248,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - const auto encoded_images = m_inputs_embedder->encode_images(rgbs); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.type == "video"); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index ce6a789c85..75348449aa 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -163,9 +163,13 @@ std::vector InputsEmbedder::IInputsEmbedder::to_single_image_tensors return single_image_tensors; } -std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images) { - std::vector embeds; +std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images, const bool& is_video) { std::vector single_images = to_single_image_tensors(images); + if (is_video) { + return m_vision_encoder->encode_video(single_images); + } + + std::vector embeds; for (const ov::Tensor& image : single_images) { embeds.emplace_back(m_vision_encoder->encode(image)); } @@ -240,8 +244,8 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st return m_impl->get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings, image_sequence); } -std::vector InputsEmbedder::encode_images(const std::vector& images) { - return m_impl->encode_images(images); +std::vector InputsEmbedder::encode_images(const std::vector& images, const bool& is_video) { + return m_impl->encode_images(images, is_video); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp 
b/src/cpp/src/visual_language/inputs_embedder.hpp index 5310e56dfa..d0eafce1d8 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -39,7 +39,7 @@ class InputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); - std::vector encode_images(const std::vector& images); + std::vector encode_images(const std::vector& images, const bool& is_video = false); // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -102,7 +102,7 @@ class InputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); - virtual std::vector encode_images(const std::vector& images); + virtual std::vector encode_images(const std::vector& images, const bool& is_video = false); virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index ce9db85c70..28a5086045 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -92,7 +92,10 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images) { +std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, const bool& is_video) { + if (is_video) { + std::cout << "== Warning: LLaVA doesn't support video process. " << std::endl; + } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/llava/classes.hpp b/src/cpp/src/visual_language/llava/classes.hpp index 8cc8c147d7..660328fb62 100644 --- a/src/cpp/src/visual_language/llava/classes.hpp +++ b/src/cpp/src/visual_language/llava/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images) override; + std::vector encode_images(const std::vector& images, const bool& is_video = false) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index 6ebb3dad76..e61c91f63c 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -333,7 +333,11 @@ ov::Tensor pack_image_features_llava_next( } // namespace -std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images) { +std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const bool& is_video) { + if (is_video) { + std::cout << "== Warning: LLaVANext doesn't support video process. 
" << std::endl; + } + std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/llava_next/classes.hpp b/src/cpp/src/visual_language/llava_next/classes.hpp index b79597b519..8559853fd4 100644 --- a/src/cpp/src/visual_language/llava_next/classes.hpp +++ b/src/cpp/src/visual_language/llava_next/classes.hpp @@ -24,7 +24,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images) override; + std::vector encode_images(const std::vector& images, const bool& is_video) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 20fbf2a6d2..1cbb8459ff 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -183,7 +183,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - const auto encoded_images = m_inputs_embedder->encode_images(rgbs); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, generation_config.type == std::string("video")); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 29e5ed43c3..26be9a1549 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -65,7 +65,7 @@ ov::Tensor reshape_image_patches( spatial_merge_size, patch_size }; - + ov::Tensor reshaped_patches(patches.get_element_type(), output_shape); const float* input_data = patches.data(); @@ -314,16 +314,14 @@ ov::Tensor merge_text_and_image_embeddings( } // namespace qwen2vl_utils -EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { - CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); - ov::InferRequest& encoder = infer_request_guard.get(); - ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - +ov::Tensor VisionEncoderQwen2VL::preproces_single_image(const ov::Tensor& image, + const ProcessorConfig& config, + ImageSize& target_image_size) { ov::Shape image_shape = image.get_shape(); auto original_height = image_shape.at(1); auto original_width = image_shape.at(2); - ImageSize target_image_size = qwen2_vl_utils::smart_resize( + target_image_size = qwen2_vl_utils::smart_resize( original_height, original_width, config.patch_size * config.merge_size, @@ -340,7 +338,78 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); clip_image_f32 normalized_image = clip_image_preprocess(ctx, resized_image); - ov::Tensor patches = clip_image_f32_to_tensor(normalized_image); + return clip_image_f32_to_tensor(normalized_image); +} + +std::vector VisionEncoderQwen2VL::encode_video(const std::vector& images, + const ov::AnyMap& config_map) { + CircularBufferQueueElementGuard 
infer_request_guard(this->m_ireq_queue_vision_encoder.get()); + ov::InferRequest& encoder = infer_request_guard.get(); + ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); + + std::vector tiled_images = images; + size_t remainder = images.size() % config.temporal_patch_size; + if (remainder > 0) { + for (size_t i = 0; i < config.temporal_patch_size - remainder; i++) { + tiled_images.push_back(images.back()); + } + } + + std::vector encoded_imgs; + for (size_t i = 0; i < tiled_images.size(); i += config.temporal_patch_size) { + auto orig_shape = tiled_images[i].get_shape(); + ov::Tensor tiled_patches(ov::element::f32, + {config.temporal_patch_size, orig_shape.at(3), orig_shape.at(1), orig_shape.at(2)}); + + ImageSize target_image_size; + for (size_t j = 0; j < config.temporal_patch_size; j++) { + auto patch = preproces_single_image(tiled_images[i + j], config, target_image_size); + std::memcpy(tiled_patches.data() + j * patch.get_byte_size() / sizeof(float), + patch.data(), + patch.get_byte_size()); + } + + ov::Tensor patches = std::move(tiled_patches); + auto patches_shape = patches.get_shape(); + size_t channel = patches_shape.at(1); + + size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; + size_t grid_h = target_image_size.height / config.patch_size; + size_t grid_w = target_image_size.width / config.patch_size; + + ov::Tensor reshaped_patches = qwen2_vl_utils::reshape_image_patches( + patches, grid_t, grid_h, grid_w, channel, config.temporal_patch_size, config.patch_size, config.merge_size + ); + ov::Tensor transposed_patches = qwen2_vl_utils::transpose_image_patches(reshaped_patches); + + ov::Shape flattened_patches_shape{ + grid_t * grid_h * grid_w, + channel * config.temporal_patch_size * config.patch_size * config.patch_size + }; + ov::Tensor flattened_patches(transposed_patches.get_element_type(), flattened_patches_shape); + std::memcpy(flattened_patches.data(), transposed_patches.data(), transposed_patches.get_byte_size()); + + encoder.set_tensor("hidden_states", flattened_patches); + encoder.infer(); + + const ov::Tensor& infer_output = encoder.get_output_tensor(); + ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape()); + std::memcpy(image_features.data(), infer_output.data(), infer_output.get_byte_size()); + + ImageSize resized_source_size{grid_h, grid_w}; + + encoded_imgs.push_back({std::move(image_features), resized_source_size}); + } + return encoded_imgs; +} + +EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { + CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); + ov::InferRequest& encoder = infer_request_guard.get(); + ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); + + ImageSize target_image_size; + ov::Tensor patches = preproces_single_image(image, config, target_image_size); // For single patch tile it to match temporal_patch_size if (patches.get_shape().at(0) == 1) { @@ -359,6 +428,7 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any } auto patches_shape = patches.get_shape(); + std::cout << "patches_shape = " << patches_shape << std::endl; size_t channel = patches_shape.at(1); size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index b468601395..27e3bb28e8 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ 
b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -17,6 +17,10 @@ class VisionEncoderQwen2VL : public VisionEncoder { using VisionEncoder::VisionEncoder; EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override; + std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; + +private: + ov::Tensor preproces_single_image(const ov::Tensor& image, const ProcessorConfig& config, ImageSize& target_image_size); }; class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 0b16534440..c7ea48df52 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -99,6 +99,10 @@ class VisionEncoder { /// @return Resulting embeddings for the resized source image and /// its slices. virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; + virtual std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map = {}) { + std::cout << "Not implemented." << std::endl; + return {}; + } /// @brief Gets processor config /// @return Processor config From 2ee043f5c29b959cb520bbae3d0953bf89c3b95b Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 31 Jul 2025 14:22:14 +0800 Subject: [PATCH 003/118] Add Python interface: generation config is_video, default false. Signed-off-by: xipingya --- src/cpp/include/openvino/genai/generation_config.hpp | 2 +- src/cpp/src/continuous_batching/pipeline_base.cpp | 6 +++--- src/cpp/src/visual_language/pipeline.cpp | 2 +- src/cpp/src/visual_language/qwen2vl/classes.cpp | 1 - src/python/openvino_genai/py_openvino_genai.pyi | 6 ++++++ src/python/py_generation_config.cpp | 1 + 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index fab0ff39b4..7e0fa5fcda 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -213,7 +213,7 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { bool apply_chat_template = true; // Video or image - std::string type = "image"; + bool is_video = false; /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id.
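For reference, a minimal sketch of driving the is_video flag added in this patch from C++ (illustrative only; `pipe`, `prompt` and `frames` are hypothetical, and the flag is superseded by the dedicated `video` property in patch 007 below):

    ov::genai::GenerationConfig config;
    config.is_video = true;  // treat the supplied tensors as frames of a single video clip
    auto result = pipe.generate(prompt, frames, config, {});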
diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 0ccc9951a0..96f870d5d1 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -167,7 +167,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto& rgbs = rgbs_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].type == "video"); + encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].is_video); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -185,7 +185,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; const auto& rgbs = rgbs_vector[i]; - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].type == "video"); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].is_video); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -248,7 +248,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.type == "video"); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.is_video); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 1cbb8459ff..e333b6ba82 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -183,7 +183,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, generation_config.type == std::string("video")); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs, generation_config.is_video); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 26be9a1549..3d208d4258 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -428,7 +428,6 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any } auto patches_shape = patches.get_shape(); - std::cout << "patches_shape = " << patches_shape << std::endl; size_t channel = patches_shape.at(1); size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 
4ccf100d57..a9fc3fe67d 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -826,6 +826,12 @@ class GenerationConfig: def max_new_tokens(self, arg0: typing.SupportsInt) -> None: ... @property + def is_video(self) -> bool: + ... + @is_video.setter + def is_video(self, arg0: bool) -> None: + ... + @property def max_ngram_size(self) -> int: ... @max_ngram_size.setter diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index 4cce29042e..bb72d9dc96 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -215,6 +215,7 @@ void init_generation_config(py::module_& m) { .def_readwrite("structured_output_config", &GenerationConfig::structured_output_config) .def_readwrite("adapters", &GenerationConfig::adapters) .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) + .def_readwrite("is_video", &GenerationConfig::is_video) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) From 29c74fdcedbdbf20d88938cfd086a8e309dfeeaa Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 5 Aug 2025 11:11:08 +0800 Subject: [PATCH 004/118] Fall back from encode_video to image encode in the base class. Signed-off-by: xipingya --- src/cpp/src/visual_language/vision_encoder.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index c7ea48df52..6459e73c43 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -99,9 +99,13 @@ class VisionEncoder { /// @return Resulting embeddings for the resized source image and /// its slices. virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; - virtual std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map = {}) { - std::cout << "Not implemented." << std::endl; - return {}; + virtual std::vector encode_video(const std::vector& images, const ov::AnyMap& config_map = {}) { + // Video encode not implemented, fallback to image encode. + std::vector embeds; + for (const ov::Tensor& image : images) { + embeds.emplace_back(this->encode(image)); + } + return embeds; } /// @brief Gets processor config From 78dac29d11394acc6130078052f2dc66392ce03b Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 5 Aug 2025 11:24:39 +0800 Subject: [PATCH 005/118] Update target image size calculation: compute it only once for video processing.
Signed-off-by: xipingya --- .../src/visual_language/qwen2vl/classes.cpp | 37 +++++++++++-------- .../src/visual_language/qwen2vl/classes.hpp | 2 +- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 3d208d4258..fae9e7e5f9 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -316,19 +316,7 @@ ov::Tensor merge_text_and_image_embeddings( ov::Tensor VisionEncoderQwen2VL::preproces_single_image(const ov::Tensor& image, const ProcessorConfig& config, - ImageSize& target_image_size) { - ov::Shape image_shape = image.get_shape(); - auto original_height = image_shape.at(1); - auto original_width = image_shape.at(2); - - target_image_size = qwen2_vl_utils::smart_resize( - original_height, - original_width, - config.patch_size * config.merge_size, - config.min_pixels, - config.max_pixels - ); - + const ImageSize& target_image_size) { clip_image_u8 input_image = tensor_to_clip_image_u8(image); clip_image_u8 resized_image; bicubic_resize(input_image, resized_image, target_image_size.width, target_image_size.height); @@ -347,6 +335,10 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector tiled_images = images; size_t remainder = images.size() % config.temporal_patch_size; if (remainder > 0) { @@ -355,13 +347,21 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector encoded_imgs; for (size_t i = 0; i < tiled_images.size(); i += config.temporal_patch_size) { auto orig_shape = tiled_images[i].get_shape(); ov::Tensor tiled_patches(ov::element::f32, {config.temporal_patch_size, orig_shape.at(3), orig_shape.at(1), orig_shape.at(2)}); - ImageSize target_image_size; for (size_t j = 0; j < config.temporal_patch_size; j++) { auto patch = preproces_single_image(tiled_images[i + j], config, target_image_size); std::memcpy(tiled_patches.data() + j * patch.get_byte_size() / sizeof(float), @@ -408,7 +408,14 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any ov::InferRequest& encoder = infer_request_guard.get(); ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - ImageSize target_image_size; + ov::Shape image_shape = image.get_shape(); + auto original_height = image_shape.at(1); + auto original_width = image_shape.at(2); + ImageSize target_image_size = qwen2_vl_utils::smart_resize(original_height, + original_width, + config.patch_size * config.merge_size, + config.min_pixels, + config.max_pixels); ov::Tensor patches = preproces_single_image(image, config, target_image_size); // For single patch tile it to match temporal_patch_size diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 27e3bb28e8..65d23c7b01 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -20,7 +20,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; private: - ov::Tensor preproces_single_image(const ov::Tensor& image, const ProcessorConfig& config, ImageSize& target_image_size); + ov::Tensor preproces_single_image(const ov::Tensor& image, const ProcessorConfig& config, const ImageSize& target_image_size); }; class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { From 7b2c115ce8c19e14a107a6123da5a11fac82f02b Mon Sep 17 00:00:00 2001 From: xipingya 
Date: Tue, 5 Aug 2025 11:37:41 +0800 Subject: [PATCH 006/118] Reduce shared code; fall back to image processing by returning an empty vector. Signed-off-by: xipingya --- src/cpp/src/visual_language/inputs_embedder.cpp | 9 +++++++-- src/cpp/src/visual_language/vision_encoder.hpp | 8 ++------ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 75348449aa..92301791c6 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -165,11 +165,16 @@ std::vector InputsEmbedder::IInputsEmbedder::to_single_image_tensors std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images, const bool& is_video) { std::vector single_images = to_single_image_tensors(images); + std::vector embeds; + if (is_video) { - return m_vision_encoder->encode_video(single_images); + embeds = m_vision_encoder->encode_video(single_images); + if (!embeds.empty()) { + return embeds; + } + // Fallback to image process. } - std::vector embeds; for (const ov::Tensor& image : single_images) { embeds.emplace_back(m_vision_encoder->encode(image)); } diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 6459e73c43..4c4fc77b7e 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -100,12 +100,8 @@ class VisionEncoder { /// its slices. virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; virtual std::vector encode_video(const std::vector& images, const ov::AnyMap& config_map = {}) { - // Video encode not implemented, fallback to image encode. - std::vector embeds; - for (const ov::Tensor& image : images) { - embeds.emplace_back(this->encode(image)); - } - return embeds; + // Video encode not implemented, return empty and fallback to image encode. + return {}; } /// @brief Gets processor config From 10d8e8d1b2d1ff75523790715ddcd924a9c8a404 Mon Sep 17 00:00:00 2001 From: xipingya Date: Sat, 9 Aug 2025 18:02:12 +0800 Subject: [PATCH 007/118] 1: remove is_video, 2: add ov::Property::video Signed-off-by: xipingya --- .../genai/continuous_batching_pipeline.hpp | 7 +++- .../openvino/genai/generation_config.hpp | 2 -- .../genai/visual_language/pipeline.hpp | 4 +++ src/cpp/src/continuous_batching/pipeline.cpp | 11 ++++-- .../src/continuous_batching/pipeline_base.cpp | 36 ++++++++++++++++--- .../src/continuous_batching/pipeline_base.hpp | 2 ++ .../src/continuous_batching/pipeline_impl.cpp | 2 +- .../continuous_batching_adapter.hpp | 3 +- src/cpp/src/visual_language/pipeline.cpp | 13 +++++-- src/cpp/src/visual_language/pipeline_base.hpp | 16 +++++++++ .../openvino_genai/py_openvino_genai.pyi | 7 ++-- .../py_continuous_batching_pipeline.cpp | 16 +++++++-- src/python/py_generation_config.cpp | 1 - src/python/py_vlm_pipeline.cpp | 9 +++-- 14 files changed, 105 insertions(+), 24 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 1a84192ead..6327e770cf 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -165,7 +165,11 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { /// @param request_id must be unique for every add_request() call.
GenerationHandle add_request(uint64_t request_id, const ov::Tensor& input_ids, const ov::genai::GenerationConfig& sampling_params); GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const ov::genai::GenerationConfig& sampling_params); - GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, const ov::genai::GenerationConfig& sampling_params); + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + const ov::genai::GenerationConfig& sampling_params); void step(); @@ -177,6 +181,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { std::vector generate( const std::vector& prompts, const std::vector>& images, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer=std::monostate{}); /** diff --git a/src/cpp/include/openvino/genai/generation_config.hpp b/src/cpp/include/openvino/genai/generation_config.hpp index 7e0fa5fcda..67b6ff2a87 100644 --- a/src/cpp/include/openvino/genai/generation_config.hpp +++ b/src/cpp/include/openvino/genai/generation_config.hpp @@ -212,8 +212,6 @@ class OPENVINO_GENAI_EXPORTS GenerationConfig { // set to true if chat template should be applied for non-chat scenarios, set to false otherwise bool apply_chat_template = true; - // Video or image - bool is_video = false; /** @brief sets eos_token_id to tokenizer_eos_token_id if eos_token_id is less than 0. * Otherwise verifies eos_token_id == tokenizer_eos_token_id. diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 568440f84b..1e889f8023 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -98,6 +98,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, + const std::vector& video, const GenerationConfig& generation_config, const StreamerVariant& streamer ); @@ -235,7 +236,10 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /* * utils that allow to use generate() in the following way: * pipe.generate(prompt, ov::genai::image(image_tensor)). + * pipe.generate(prompt, ov::genai::images(image_tensors)). + * pipe.generate(prompt, ov::genai::video(video_tensors)).
*/ static constexpr ov::Property image{"image"}; static constexpr ov::Property> images{"images"}; +static constexpr ov::Property> video{"video"}; } diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index 41cbc0d07b..3b928ef1a8 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -237,8 +237,12 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, co return m_impl->add_request(request_id, input_ids, sampling_params); } -GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, const ov::genai::GenerationConfig& sampling_params) { - return m_impl->add_request(request_id, prompt, images, sampling_params); +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, prompt, images, video, sampling_params); } void ContinuousBatchingPipeline::step() { @@ -272,9 +276,10 @@ std::vector ContinuousBatchingPipeline::generate(const std::ve std::vector ContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& images, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer) { - return m_impl->generate(prompts, images, sampling_params, streamer); + return m_impl->generate(prompts, images, videos, sampling_params, streamer); } diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 96f870d5d1..db25f1c2ea 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -51,7 +51,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( // TODO: remove this code and within model runner add check: if sequence group type is tokens, // but embedding model is available => compute embeddings first, then pass to LLM std::vector> images(prompts.size()); - auto results_vlm = generate(prompts, images, sampling_params, streamer); + std::vector> videos(prompts.size()); + auto results_vlm = generate(prompts, images, videos, sampling_params, streamer); std::vector resutls; for (auto& vlm_result : results_vlm) { GenerationResult result; @@ -150,6 +151,7 @@ std::vector ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& rgbs_vector, + const std::vector>& video_vector, const std::vector& sampling_params, const StreamerVariant& streamer) { auto generate_start_time = std::chrono::steady_clock::now(); @@ -157,6 +159,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors."); + OPENVINO_ASSERT(prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of video vectors."); std::vector input_embeds_list; std::vector vlm_perf_metrics(prompts.size()); @@ -165,9 +168,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( if (m_is_chat_conversation) { OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); const auto& rgbs = 
rgbs_vector[0]; + const auto& video = video_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[0].is_video); + if (rgbs.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(rgbs, false); + } else if (video.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(video, true); + } m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -177,7 +185,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( std::string templated_history = m_tokenizer.apply_chat_template(m_history, true); m_inputs_embedder->set_apply_chat_template_status(false); - input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids)); + input_embeds_list.push_back(m_inputs_embedder->get_inputs_embeds(templated_history, + m_history_images, + vlm_perf_metrics[0], + encoded_images.size() > 0, + m_history_image_ids)); auto end_get_inputs_embeds = std::chrono::steady_clock::now(); vlm_perf_metrics[0].vlm_raw_metrics.prepare_embeddings_durations.emplace_back(PerfMetrics::get_microsec(end_get_inputs_embeds - start_get_inputs_embeds)); @@ -185,7 +197,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; const auto& rgbs = rgbs_vector[i]; - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params[i].is_video); + const auto& video = video_vector[i]; + std::vector encoded_images; + if (rgbs.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(rgbs, false); + } else if (video.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(video, true); + } + auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); auto start_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -241,6 +260,7 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, + const std::vector& video, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); ov::genai::VLMPerfMetrics metrics; @@ -248,7 +268,13 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, sampling_params.is_video); + + std::vector encoded_images; + if (rgbs.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(rgbs, false); + } else if (video.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(video, true); + } const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 69d593529f..66635d6aa0 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp 
+++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -92,6 +92,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, + const std::vector& video, GenerationConfig sampling_params); /** @@ -124,6 +125,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { generate( const std::vector& prompts, const std::vector>& rgbs, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer); diff --git a/src/cpp/src/continuous_batching/pipeline_impl.cpp b/src/cpp/src/continuous_batching/pipeline_impl.cpp index fc27b92b7a..6eefc9d5fa 100644 --- a/src/cpp/src/continuous_batching/pipeline_impl.cpp +++ b/src/cpp/src/continuous_batching/pipeline_impl.cpp @@ -246,7 +246,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request timer.end(); return add_request(request_id, inputs, sampling_params); } else if (m_model_input_type == ModelInputType::EMBEDDINGS) { - return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, sampling_params); + return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, {}, sampling_params); } else { OPENVINO_THROW("Unknown model input type."); } diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index 677991d875..a314caca7d 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -44,11 +44,12 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, + const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer ) override { auto start_time = std::chrono::steady_clock::now(); - auto result = m_impl.generate({prompt}, {rgbs}, {generation_config}, streamer)[0]; + auto result = m_impl.generate({prompt}, {rgbs}, {video}, {generation_config}, streamer)[0]; auto stop_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index e333b6ba82..3acdbe96e4 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -153,6 +153,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, + const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer ) override { @@ -183,7 +184,12 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, generation_config.is_video); + std::vector encoded_images; + if (rgbs.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(rgbs, false); + } else if (rgbs.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(video, true); + } auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { @@ -444,10 +450,11 @@ VLMPipeline::~VLMPipeline() = default; VLMDecodedResults VLMPipeline::generate( const std::string& prompt, const std::vector& rgbs, + const 
std::vector& video, const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, rgbs, generation_config, streamer); + return m_pimpl->generate(prompt, rgbs, video, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( @@ -456,7 +463,7 @@ VLMDecodedResults VLMPipeline::generate( const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, {rgb}, generation_config, streamer); + return m_pimpl->generate(prompt, {rgb}, {}, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 6ee56f7e4e..3ecad2e81e 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -23,6 +23,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { virtual VLMDecodedResults generate( const std::string& prompt, const std::vector& rgbs, + const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer ) = 0; @@ -33,6 +34,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { ) { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); + auto video = config_map.find(ov::genai::video.name()); OPENVINO_ASSERT( config_map.end() == image || config_map.end() == images, "Only one property can be set: image of images." @@ -52,6 +54,19 @@ class ov::genai::VLMPipeline::VLMPipelineBase { } } + std::vector video_rgbs; + if (config_map.end() != video) { + if (video->second.is>()) { + video_rgbs = video->second.as>(); + } + else if (video->second.is()){ + video_rgbs = {video->second.as()}; + } + else { + OPENVINO_THROW("Unknown video type."); + } + } + ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); config.update_generation_config(config_map); @@ -59,6 +74,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { return generate( prompt, rgbs, + video_rgbs, config, utils::get_streamer_from_map(config_map) ); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index a9fc3fe67d..600c6ee80f 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -3220,7 +3220,7 @@ class VLMPipeline: def finish_chat(self) -> None: ... @typing.overload - def generate(self, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: + def generate(self, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. 
@@ -3244,7 +3244,10 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor - + + :param video: video frames as a list of images + :type video: list[ov.Tensor] + :param generation_config: generation_config :type generation_config: GenerationConfig or a dict diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 05af43b7a7..4fd4ccd1c2 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -370,7 +370,17 @@ void init_continuous_batching_pipeline(py::module_& m) { .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config")) .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config")) - .def("add_request", py::overload_cast&, const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("images"), py::arg("generation_config")) + .def("add_request", + py::overload_cast&, + const std::vector&, + const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), + py::arg("request_id"), + py::arg("prompt"), + py::arg("images"), + py::arg("video"), + py::arg("generation_config")) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) @@ -426,6 +436,7 @@ void init_continuous_batching_pipeline(py::module_& m) { [](ContinuousBatchingPipeline& pipe, const std::vector& prompts, const std::vector>& images, + const std::vector>& videos, const std::vector& generation_config, const pyutils::PyBindStreamerVariant& py_streamer ) -> py::typing::Union> { auto streamer = pyutils::pystreamer_to_streamer(py_streamer); std::vector generated_results; { py::gil_scoped_release rel; - generated_results = pipe.generate(prompts, images, generation_config, streamer); + generated_results = pipe.generate(prompts, images, videos, generation_config, streamer); } return py::cast(generated_results); }, py::arg("prompts"), py::arg("images"), + py::arg("videos"), py::arg("generation_config"), py::arg("streamer") = std::monostate{} ); diff --git a/src/python/py_generation_config.cpp b/src/python/py_generation_config.cpp index bb72d9dc96..4cce29042e 100644 --- a/src/python/py_generation_config.cpp +++ b/src/python/py_generation_config.cpp @@ -215,7 +215,6 @@ void init_generation_config(py::module_& m) { .def_readwrite("structured_output_config", &GenerationConfig::structured_output_config) .def_readwrite("adapters", &GenerationConfig::adapters) .def_readwrite("apply_chat_template", &GenerationConfig::apply_chat_template) - .def_readwrite("is_video", &GenerationConfig::is_video) .def("set_eos_token_id", &GenerationConfig::set_eos_token_id, py::arg("tokenizer_eos_token_id")) .def("is_beam_search", &GenerationConfig::is_beam_search) .def("is_greedy_decoding", &GenerationConfig::is_greedy_decoding) diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index d5d7b07707..92a35f36e1 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -123,6 +123,7 @@ py::object call_vlm_generate( ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, + const std::vector& video, const
ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs @@ -132,7 +133,7 @@ py::object call_vlm_generate( ov::genai::VLMDecodedResults res; { py::gil_scoped_release rel; - res= pipe.generate(prompt, images, updated_config, streamer); + res= pipe.generate(prompt, images, video, updated_config, streamer); } return py::cast(res); } @@ -221,14 +222,16 @@ void init_vlm_pipeline(py::module_& m) { [](ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, + const std::vector& video, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, images, video, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", + py::arg("video"), "Input video", py::arg("generation_config"), "generation_config", py::arg("streamer") = std::monostate(), "streamer", (vlm_generate_docstring + std::string(" \n ")).c_str() @@ -242,7 +245,7 @@ void init_vlm_pipeline(py::module_& m) { const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, {images}, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, {images}, {}, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", From a3000d4b24ac3294249494600dca8be145734764 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Thu, 11 Sep 2025 15:25:06 +0800 Subject: [PATCH 008/118] Update src/cpp/src/visual_language/llava/classes.cpp Co-authored-by: Wanglei Shen --- src/cpp/src/visual_language/llava/classes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index 28a5086045..4c768c787f 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -94,7 +94,7 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, const bool& is_video) { if (is_video) { - std::cout << "== Warning: LLaVA doesn't support video process. " << std::endl; + std::cout << "== Warning: LLaVA doesn't support video process. Input images are processed as separate images." 
<< std::endl; } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; From 4d8375d1ecff498746a971451214a4ac159a5625 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Fri, 12 Sep 2025 09:32:12 +0800 Subject: [PATCH 009/118] Update src/cpp/src/visual_language/pipeline.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 8c2504dd62..8449fa6c78 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -188,7 +188,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ std::vector encoded_images; if (rgbs.size() > 0) { encoded_images = m_inputs_embedder->encode_images(rgbs, false); - } else if (rgbs.size() > 0) { + } else if (video.size() > 0) { encoded_images = m_inputs_embedder->encode_images(video, true); } auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); From ef9f8687c3e3dc2b9476cb881f16c21a2c04f5ad Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 12 Sep 2025 09:36:01 +0800 Subject: [PATCH 010/118] rename according to copilot suggestion --- src/cpp/src/visual_language/qwen2vl/classes.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index ee9c1448cd..7a08c29c3f 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -21,7 +21,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; private: - ov::Tensor preproces_single_image(const ov::Tensor& image, const ProcessorConfig& config, const ImageSize& target_image_size); + ov::Tensor preprocess_single_image(const ov::Tensor& image, const ProcessorConfig& config, const ImageSize& target_image_size); EncodedImage encode_with_imagepreprocess_cpp(const ov::Tensor& image, const ov::AnyMap& config_map); EncodedImage encode_with_imagepreprocess_ov(const ov::Tensor& image, const ov::AnyMap& config_map); bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess From f92b19b107229402897ca9a9d4986b5d7591ff88 Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 12 Sep 2025 16:53:49 +0800 Subject: [PATCH 011/118] rename rgbs to images Signed-off-by: xipingya --- .../openvino/genai/visual_language/pipeline.hpp | 3 ++- .../visual_language/continuous_batching_adapter.hpp | 4 ++-- src/cpp/src/visual_language/pipeline.cpp | 13 +++++++------ src/cpp/src/visual_language/pipeline_base.hpp | 12 ++++++------ src/cpp/src/visual_language/qwen2vl/classes.hpp | 1 - 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 7d2341c5de..5a8efcac2c 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -90,6 +90,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// uint8 RGB images with [NHWC] or [HWC] layout. /// @param prompt A prompt to respond to. /// @param images Images to be prepended to a prompt. 
+ /// @param video Video frames to be prepended to a prompt. /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. @@ -97,7 +98,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false. VLMDecodedResults generate( const std::string& prompt, - const std::vector& rgbs, + const std::vector& images, const std::vector& video, const GenerationConfig& generation_config, const StreamerVariant& streamer diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index a314caca7d..123a55cf51 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -43,13 +43,13 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V VLMDecodedResults generate( const std::string& prompt, - const std::vector& rgbs, + const std::vector& images, const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer ) override { auto start_time = std::chrono::steady_clock::now(); - auto result = m_impl.generate({prompt}, {rgbs}, {video}, {generation_config}, streamer)[0]; + auto result = m_impl.generate({prompt}, {images}, {video}, {generation_config}, streamer)[0]; auto stop_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 8449fa6c78..cb45014a61 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -152,7 +152,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ VLMDecodedResults generate( const std::string& prompt, - const std::vector& rgbs, + const std::vector& images, const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer @@ -178,16 +178,17 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ generation_config.validate(); if (m_is_npu) { - OPENVINO_ASSERT(rgbs.size() <= 1u, "Currently only batch size equal to 1 is supported for NPU device!"); + OPENVINO_ASSERT(images.size() <= 1u, "Currently only batch size equal to 1 is supported for NPU device!"); OPENVINO_ASSERT(generation_config.is_greedy_decoding() || generation_config.is_multinomial(), "Currently only greedy and multinomial decoding are supported for NPU device!"); OPENVINO_ASSERT(generation_config.num_return_sequences == 1u, "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } + // Currently only one input is supported. Video or images. 
std::vector encoded_images; - if (rgbs.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(rgbs, false); + if (images.size() > 0) { + encoded_images = m_inputs_embedder->encode_images(images, false); } else if (video.size() > 0) { encoded_images = m_inputs_embedder->encode_images(video, true); } @@ -457,12 +458,12 @@ VLMPipeline::~VLMPipeline() = default; VLMDecodedResults VLMPipeline::generate( const std::string& prompt, - const std::vector& rgbs, + const std::vector& images, const std::vector& video, const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, rgbs, video, generation_config, streamer); + return m_pimpl->generate(prompt, images, video, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 3ecad2e81e..bd8de9c1d4 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -22,7 +22,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { virtual VLMDecodedResults generate( const std::string& prompt, - const std::vector& rgbs, + const std::vector& images, const std::vector& video, GenerationConfig generation_config, const StreamerVariant& streamer @@ -39,15 +39,15 @@ class ov::genai::VLMPipeline::VLMPipelineBase { config_map.end() == image || config_map.end() == images, "Only one property can be set: image of images." ); - std::vector rgbs; + std::vector image_rgbs; if (config_map.end() != image) { - rgbs = {image->second.as()}; + image_rgbs = {image->second.as()}; } if (config_map.end() != images) { if (images->second.is>()) { - rgbs = images->second.as>(); + image_rgbs = images->second.as>(); } else if (images->second.is()){ - rgbs = {images->second.as()}; + image_rgbs = {images->second.as()}; } else { OPENVINO_THROW("Unknown images type."); @@ -73,7 +73,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { return generate( prompt, - rgbs, + image_rgbs, video_rgbs, config, utils::get_streamer_from_map(config_map) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 7a08c29c3f..1653ccd781 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -21,7 +21,6 @@ class VisionEncoderQwen2VL : public VisionEncoder { std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; private: - ov::Tensor preprocess_single_image(const ov::Tensor& image, const ProcessorConfig& config, const ImageSize& target_image_size); EncodedImage encode_with_imagepreprocess_cpp(const ov::Tensor& image, const ov::AnyMap& config_map); EncodedImage encode_with_imagepreprocess_ov(const ov::Tensor& image, const ov::AnyMap& config_map); bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess From 66cdf383b97e1c2e11141f9a1d6799292af49d1b Mon Sep 17 00:00:00 2001 From: xipingya Date: Mon, 15 Sep 2025 10:35:05 +0800 Subject: [PATCH 012/118] enable if node to unify image and video preprocess. 
Signed-off-by: xipingya --- .../src/visual_language/gemma3/classes.cpp | 10 +- .../src/visual_language/gemma3/classes.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 2 +- src/cpp/src/visual_language/pipeline_base.hpp | 6 +- .../src/visual_language/qwen2vl/classes.cpp | 266 ++++++++++-------- .../src/visual_language/qwen2vl/classes.hpp | 2 +- 6 files changed, 161 insertions(+), 127 deletions(-) diff --git a/src/cpp/src/visual_language/gemma3/classes.cpp b/src/cpp/src/visual_language/gemma3/classes.cpp index cee5f2cbfd..2a2ff28df0 100644 --- a/src/cpp/src/visual_language/gemma3/classes.cpp +++ b/src/cpp/src/visual_language/gemma3/classes.cpp @@ -71,12 +71,20 @@ bool InputsEmbedderGemma3::has_token_type_ids() const { return true; } -std::vector InputsEmbedderGemma3::encode_images(const std::vector& images) { +std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const bool& is_video) { std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); + if (is_video) { + embeds = m_vision_encoder->encode_video(single_images, vision_config); + if (!embeds.empty()) { + return embeds; + } + // Fallback to image process. + } + embeds.reserve(single_images.size()); for (const ov::Tensor& image : single_images) { embeds.emplace_back(m_vision_encoder->encode(image, vision_config)); diff --git a/src/cpp/src/visual_language/gemma3/classes.hpp b/src/cpp/src/visual_language/gemma3/classes.hpp index b78fa8d193..ad8f0814df 100644 --- a/src/cpp/src/visual_language/gemma3/classes.hpp +++ b/src/cpp/src/visual_language/gemma3/classes.hpp @@ -41,7 +41,7 @@ class InputsEmbedderGemma3 : public InputsEmbedder::IInputsEmbedder { bool has_token_type_ids() const override; - std::vector encode_images(const std::vector& images) override; + std::vector encode_images(const std::vector& images, const bool& is_video = false) override; std::pair> normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const override; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index cb45014a61..5f2b1f16da 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -185,7 +185,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - // Currently only one input is supported. Video or images. + // Currently only one input is supported. Video, images or image. std::vector encoded_images; if (images.size() > 0) { encoded_images = m_inputs_embedder->encode_images(images, false); diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index bd8de9c1d4..06e8ee1914 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -35,10 +35,8 @@ class ov::genai::VLMPipeline::VLMPipelineBase { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); auto video = config_map.find(ov::genai::video.name()); - OPENVINO_ASSERT( - config_map.end() == image || config_map.end() == images, - "Only one property can be set: image of images." 
- );
+ OPENVINO_ASSERT(config_map.end() == image || config_map.end() == images || config_map.end() == video,
+ "Only one property can be set: image, images, or video.");
 std::vector image_rgbs;
 if (config_map.end() != image) {
 image_rgbs = {image->second.as()};
diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp
index 7b6a96d561..7e76e0adc2 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.cpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp
@@ -18,6 +18,8 @@
 #include "openvino/op/round.hpp"
 #include "openvino/op/transpose.hpp"
 #include "openvino/op/tile.hpp"
+#include "openvino/op/if.hpp"
+#include "openvino/op/concat.hpp"

 #include "visual_language/vl_sdpa_transformations.hpp"

@@ -112,18 +114,70 @@ std::shared_ptr create_flatten_patches(std::shared_ptr input
 return flattened;
 }

+std::pair, std::shared_ptr> patch_preprocess_branch_image(
+ std::shared_ptr raw_images_1,
+ std::shared_ptr resize_shape,
+ std::shared_ptr image_mean,
+ std::shared_ptr image_scale,
+ std::shared_ptr tile_shape) {
+ auto img_f32_nchw = create_f32_nchw_input(raw_images_1);
+ auto img_resized = create_bicubic_resize(img_f32_nchw, resize_shape);
+ auto img_normalized = create_normalization(img_resized, image_mean, image_scale);
+ auto temporal_images = std::make_shared(img_normalized, tile_shape);
+ auto results = std::make_shared(temporal_images);
+ return {
+ std::make_shared(results, ov::ParameterVector{raw_images_1, resize_shape, tile_shape}, "then_body"),
+ results};
+}
+
+std::pair, std::shared_ptr> patch_preprocess_branch_video(
+ std::shared_ptr same_image,
+ std::shared_ptr raw_images_1,
+ std::shared_ptr raw_images_2,
+ std::shared_ptr resize_shape,
+ std::shared_ptr image_mean,
+ std::shared_ptr image_scale) {
+ auto img_f32_nchw_1 = create_f32_nchw_input(raw_images_1);
+ auto img_resized_1 = create_bicubic_resize(img_f32_nchw_1, resize_shape);
+ auto img_normalized_1 = create_normalization(img_resized_1, image_mean, image_scale);
+
+ auto img_f32_nchw_2 = create_f32_nchw_input(raw_images_2);
+ auto img_resized_2 = create_bicubic_resize(img_f32_nchw_2, resize_shape);
+ auto img_normalized_2 = create_normalization(img_resized_2, image_mean, image_scale);
+
+ int64_t concat_axis = 0;
+ ov::NodeVector inputs_to_concat = {img_normalized_1, img_normalized_2};
+ auto temporal_images = std::make_shared(inputs_to_concat, concat_axis);
+
+ auto result_temporal_images = std::make_shared(temporal_images);
+ auto result_ignore = std::make_shared(same_image);
+ return {std::make_shared(ov::ResultVector{result_temporal_images, result_ignore},
+ ov::ParameterVector{same_image, raw_images_1, raw_images_2, resize_shape},
+ "else_body"),
+ result_temporal_images};
+}
+
 std::shared_ptr patch_preprocess_into_model(std::shared_ptr model_org,
 const ov::Tensor& image_mean_tensor,
 const ov::Tensor& image_scale_tensor) {
- auto input_images = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1});
+ auto same_image = std::make_shared(ov::element::f32, ov::Shape{1});
+
+ auto raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1});
+ auto raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1});
+
 auto resize_shape = std::make_shared(ov::element::i64, ov::PartialShape{2});
 auto tile_shape = std::make_shared(ov::element::i64, ov::PartialShape{4});
 auto reshape_shape8d = std::make_shared(ov::element::i64, ov::PartialShape{8});
 auto reshape_shape4d = std::make_shared(ov::element::i64,
ov::PartialShape{4}); auto reshape_shape2d = std::make_shared(ov::element::i64, ov::PartialShape{2}); - input_images->set_friendly_name("input_images"); - input_images->output(0).get_tensor().set_names({"input_images"}); + same_image->set_friendly_name("same_image"); + same_image->output(0).get_tensor().set_names({"same_image"}); + + raw_images_1->set_friendly_name("raw_images_1"); + raw_images_1->output(0).get_tensor().set_names({"raw_images_1"}); + raw_images_2->set_friendly_name("raw_images_2"); + raw_images_2->output(0).get_tensor().set_names({"raw_images_2"}); resize_shape->set_friendly_name("resize_shape"); resize_shape->output(0).get_tensor().set_names({"resize_shape"}); @@ -139,16 +193,45 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptroutput(0).get_tensor().set_names({"reshape_shape2d"}); auto image_mean = std::make_shared(image_mean_tensor); auto image_scale = std::make_shared(image_scale_tensor); - auto img_f32_nchw = create_f32_nchw_input(input_images); - auto img_resized = create_bicubic_resize(img_f32_nchw, resize_shape); + // If + auto then_raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); + auto then_resize_target_shape = std::make_shared(ov::element::i64, ov::PartialShape{2}); + auto then_tile_shape = std::make_shared(ov::element::i64, ov::PartialShape{4}); + auto model_then = patch_preprocess_branch_image(then_raw_images_1, + then_resize_target_shape, + image_mean, + image_scale, + then_tile_shape); + + auto else_same_image = std::make_shared(ov::element::f32, ov::Shape{1}); + auto else_raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); + auto else_raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); + auto else_resize_target_shape = std::make_shared(ov::element::i64, ov::PartialShape{2}); + auto model_else = patch_preprocess_branch_video(else_same_image, + else_raw_images_1, + else_raw_images_2, + else_resize_target_shape, + image_mean, + image_scale); + + auto if_op = std::make_shared(); + if_op->set_then_body(model_then.first); + if_op->set_else_body(model_else.first); + if_op->set_input(same_image->output(0), nullptr, else_same_image); + + if_op->set_input(raw_images_1->output(0), nullptr, else_raw_images_1); + if_op->set_input(raw_images_2->output(0), nullptr, else_raw_images_2); + if_op->set_input(resize_shape->output(0), nullptr, else_resize_target_shape); - auto img_normalized = create_normalization(img_resized, image_mean, image_scale); + if_op->set_input(raw_images_1->output(0), then_raw_images_1, nullptr); + if_op->set_input(resize_shape->output(0), then_resize_target_shape, nullptr); + if_op->set_input(tile_shape->output(0), then_tile_shape, nullptr); - auto temporal_images = std::make_shared(img_normalized, tile_shape); + auto temporal_images = if_op->set_output(model_then.second, model_else.second); auto img_8d = - create_transpose_patches(temporal_images, + create_transpose_patches(temporal_images.get_node_shared_ptr(), reshape_shape8d, std::make_shared(ov::element::i32, Shape{8}, @@ -168,11 +251,17 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptrget_results(); - return std::make_shared( - results, - ov::ParameterVector{input_images, resize_shape, tile_shape, reshape_shape8d, reshape_shape4d, reshape_shape2d}); + return std::make_shared(results, + ov::ParameterVector{same_image, + raw_images_1, + raw_images_2, + resize_shape, + tile_shape, + reshape_shape8d, + reshape_shape4d, + reshape_shape2d}); } -} // namespace +} // namespace 
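// NOTE (illustrative aside, not part of the patch): the branch wiring above follows
// the standard ov::op::v8::If pattern. Each branch is a standalone ov::Model whose
// Parameters and Results act as the branch ports; set_input() maps an outer tensor
// onto the branch Parameters (nullptr when a branch does not consume it), and
// set_output() merges one Result per branch into a single If output. A minimal,
// self-contained sketch of that pattern, with placeholder names and shapes:

#include "openvino/op/concat.hpp"
#include "openvino/op/if.hpp"
#include "openvino/openvino.hpp"

std::shared_ptr<ov::Model> make_if_example() {
    // Condition selects the branch: then = single image, else = two video frames.
    auto cond = std::make_shared<ov::op::v0::Parameter>(ov::element::boolean, ov::Shape{1});
    auto frame_a = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, -1, -1});
    auto frame_b = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, -1, -1});

    // then-body: duplicate the single frame along the batch axis.
    auto then_in = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, -1, -1});
    auto then_pair = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{then_in, then_in}, 0);
    auto then_body = std::make_shared<ov::Model>(ov::OutputVector{then_pair}, ov::ParameterVector{then_in});

    // else-body: stack two distinct frames along the batch axis.
    auto else_in_a = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, -1, -1});
    auto else_in_b = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::PartialShape{1, 3, -1, -1});
    auto else_pair = std::make_shared<ov::op::v0::Concat>(ov::NodeVector{else_in_a, else_in_b}, 0);
    auto else_body = std::make_shared<ov::Model>(ov::OutputVector{else_pair}, ov::ParameterVector{else_in_a, else_in_b});

    auto if_op = std::make_shared<ov::op::v8::If>(cond);
    if_op->set_then_body(then_body);
    if_op->set_else_body(else_body);
    if_op->set_input(frame_a->output(0), then_in, else_in_a);  // consumed by both branches
    if_op->set_input(frame_b->output(0), nullptr, else_in_b);  // consumed by the else branch only
    auto merged = if_op->set_output(then_body->get_results()[0], else_body->get_results()[0]);

    return std::make_shared<ov::Model>(ov::OutputVector{merged}, ov::ParameterVector{cond, frame_a, frame_b});
}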
namespace qwen2_vl_utils { @@ -550,117 +639,22 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_cpp(const ov::Ten std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); clip_image_f32 normalized_image = clip_image_preprocess(ctx, resized_image); - return clip_image_f32_to_tensor(normalized_image); -} - -std::vector VisionEncoderQwen2VL::encode_video(const std::vector& images, - const ov::AnyMap& config_map) { - CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); - ov::InferRequest& encoder = infer_request_guard.get(); - ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - - if (images.empty()) { - return {}; - } - - std::vector tiled_images = images; - size_t remainder = images.size() % config.temporal_patch_size; - if (remainder > 0) { - for (size_t i = 0; i < config.temporal_patch_size - remainder; i++) { - tiled_images.push_back(images.back()); - } - } - - ov::Shape image_shape = images[0].get_shape(); - auto original_height = image_shape.at(1); - auto original_width = image_shape.at(2); - ImageSize target_image_size = qwen2_vl_utils::smart_resize(original_height, - original_width, - config.patch_size * config.merge_size, - config.min_pixels, - config.max_pixels); - - std::vector encoded_imgs; - for (size_t i = 0; i < tiled_images.size(); i += config.temporal_patch_size) { - auto orig_shape = tiled_images[i].get_shape(); - ov::Tensor tiled_patches(ov::element::f32, - {config.temporal_patch_size, orig_shape.at(3), orig_shape.at(1), orig_shape.at(2)}); - - for (size_t j = 0; j < config.temporal_patch_size; j++) { - auto patch = preproces_single_image(tiled_images[i + j], config, target_image_size); - std::memcpy(tiled_patches.data() + j * patch.get_byte_size() / sizeof(float), - patch.data(), - patch.get_byte_size()); - } - - ov::Tensor patches = std::move(tiled_patches); - auto patches_shape = patches.get_shape(); - size_t channel = patches_shape.at(1); - - size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; - size_t grid_h = target_image_size.height / config.patch_size; - size_t grid_w = target_image_size.width / config.patch_size; - - ov::Tensor reshaped_patches = qwen2_vl_utils::reshape_image_patches( - patches, grid_t, grid_h, grid_w, channel, config.temporal_patch_size, config.patch_size, config.merge_size - ); - ov::Tensor transposed_patches = qwen2_vl_utils::transpose_image_patches(reshaped_patches); - - ov::Shape flattened_patches_shape{ - grid_t * grid_h * grid_w, - channel * config.temporal_patch_size * config.patch_size * config.patch_size - }; - ov::Tensor flattened_patches(transposed_patches.get_element_type(), flattened_patches_shape); - std::memcpy(flattened_patches.data(), transposed_patches.data(), transposed_patches.get_byte_size()); - - encoder.set_tensor("hidden_states", flattened_patches); - encoder.infer(); - - const ov::Tensor& infer_output = encoder.get_output_tensor(); - ov::Tensor image_features(infer_output.get_element_type(), infer_output.get_shape()); - std::memcpy(image_features.data(), infer_output.data(), infer_output.get_byte_size()); - - ImageSize resized_source_size{grid_h, grid_w}; - - encoded_imgs.push_back({std::move(image_features), resized_source_size}); - } - return encoded_imgs; -} - -EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { - CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); - ov::InferRequest& encoder = 
infer_request_guard.get(); - ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - - ov::Shape image_shape = image.get_shape(); - auto original_height = image_shape.at(1); - auto original_width = image_shape.at(2); - ImageSize target_image_size = qwen2_vl_utils::smart_resize(original_height, - original_width, - config.patch_size * config.merge_size, - config.min_pixels, - config.max_pixels); - ov::Tensor patches = preproces_single_image(image, config, target_image_size); - + ov::Tensor patches = clip_image_f32_to_tensor(normalized_image); // For single patch tile it to match temporal_patch_size if (patches.get_shape().at(0) == 1) { auto orig_shape = patches.get_shape(); ov::Tensor tiled_patches(patches.get_element_type(), - {config.temporal_patch_size, orig_shape.at(1), orig_shape.at(2), orig_shape.at(3)}); - + {config.temporal_patch_size, orig_shape.at(1), orig_shape.at(2), orig_shape.at(3)}); for (size_t i = 0; i < config.temporal_patch_size; i++) { - std::memcpy( - tiled_patches.data() + i * patches.get_byte_size() / sizeof(float), - patches.data(), - patches.get_byte_size() - ); + std::memcpy(tiled_patches.data() + i * patches.get_byte_size() / sizeof(float), + patches.data(), + patches.get_byte_size()); + } patches = std::move(tiled_patches); } - auto patches_shape = patches.get_shape(); size_t channel = patches_shape.at(1); - size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; size_t grid_h = target_image_size.height / config.patch_size; size_t grid_w = target_image_size.width / config.patch_size; @@ -688,12 +682,12 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any } // keep both implementations for comparison and testing, here is the ov version -EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const ov::Tensor& image, const ov::AnyMap& config_map) { +EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vector& images, const ov::AnyMap& config_map) { CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); ov::InferRequest& encoder = infer_request_guard.get(); ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - ov::Shape image_shape = image.get_shape(); + ov::Shape image_shape = images[0].get_shape(); auto original_height = image_shape.at(1); auto original_width = image_shape.at(2); @@ -705,12 +699,16 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const ov::Tens config.max_pixels ); - ov::Tensor input_images(ov::element::u8, image_shape, image.data()); + OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); + ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); + ov::Tensor input_image_2(ov::element::u8, + image_shape, + images.size() == 2 ? 
images[1].data() : images[0].data());
 uint64_t a_target_shape[2] = {target_image_size.height, target_image_size.width};
 ov::Tensor target_shape(ov::element::i64, ov::Shape{2}, a_target_shape);
- auto patches_shape = image.get_shape();
+ auto patches_shape = images[0].get_shape();
 size_t temporal_patch_size = std::max(static_cast(patches_shape.at(0)), static_cast(config.temporal_patch_size));
 size_t channel = image_shape.at(3);
@@ -739,7 +737,8 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const ov::Tens
 ov::Tensor reshape_shape4d(ov::element::i64, ov::Shape{4}, a_temp_shape4d);
 ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape);

- encoder.set_tensor("input_images", input_images);
+ encoder.set_tensor("raw_images_1", input_image_1);
+ encoder.set_tensor("raw_images_2", input_image_2);
 encoder.set_tensor("resize_shape", target_shape);
 encoder.set_tensor("tile_shape", tile_shape);
 encoder.set_tensor("reshape_shape8d", reshape_shape8d);
@@ -761,7 +760,36 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any
 if (use_ov_image_preprocess == false) {
 return encode_with_imagepreprocess_cpp(image, config_map);
 }
- return encode_with_imagepreprocess_ov(image, config_map);
+ return encode_with_imagepreprocess_ov({image}, config_map);
+}
+
+std::vector VisionEncoderQwen2VL::encode_video(const std::vector& images,
+ const ov::AnyMap& config_map) {
+ ProcessorConfig config = utils::from_any_map(config_map, m_processor_config);
+ std::vector encoded_imgs;
+ size_t i = 0;
+ for (; i < images.size(); i += config.temporal_patch_size) {
+ EncodedImage encoded_img;
+ if (use_ov_image_preprocess == false) {
+ // return encode_with_imagepreprocess_cpp(image, config_map);
+ std::cout << "Warning: Not implemented. fallback to encode_with_imagepreprocess_ov" << std::endl;
+ }
+
+ encoded_img = encode_with_imagepreprocess_ov(
+ std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size),
+ config_map);
+ }
+ for (; i < images.size(); i++) {
+ EncodedImage encoded_img;
+ if (use_ov_image_preprocess == false) {
+ // return encode_with_imagepreprocess_cpp(image, config_map);
+ std::cout << "Warning: Not implemented. fallback to encode_with_imagepreprocess_ov" << std::endl;
+ }
+
+ encoded_img = encode_with_imagepreprocess_ov({images[i]}, config_map);
+ }
+
+ return encoded_imgs;
+}

 InputsEmbedderQwen2VL::InputsEmbedderQwen2VL(
diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp
index 1653ccd781..d5a082e489 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.hpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp
@@ -22,7 +22,7 @@ class VisionEncoderQwen2VL : public VisionEncoder {

 private:
 EncodedImage encode_with_imagepreprocess_cpp(const ov::Tensor& image, const ov::AnyMap& config_map);
- EncodedImage encode_with_imagepreprocess_ov(const ov::Tensor& image, const ov::AnyMap& config_map);
+ EncodedImage encode_with_imagepreprocess_ov(const std::vector& image, const ov::AnyMap& config_map);
 bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess
 };

From 3eda0364aeb30c50d025880a25643e7a65d84804 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Mon, 15 Sep 2025 16:24:01 +0800
Subject: [PATCH 013/118] cpp preprocess: enable video preprocess.
Signed-off-by: xipingya --- src/cpp/src/visual_language/llava/classes.cpp | 2 +- .../visual_language/llava_next/classes.cpp | 2 +- .../src/visual_language/qwen2vl/classes.cpp | 90 ++++++++++--------- .../src/visual_language/qwen2vl/classes.hpp | 2 +- .../src/visual_language/vision_encoder.hpp | 2 + 5 files changed, 52 insertions(+), 46 deletions(-) diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index 8201f7f31c..1af833668a 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -94,7 +94,7 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, const bool& is_video) { if (is_video) { - std::cout << "== Warning: LLaVA doesn't support video process. Input images are processed as separate images." << std::endl; + Logger::warn("LLaVA doesn't support video preprocess currently. Input images are processed as separate images."); } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index d849109d97..3f6a6dcd8e 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -335,7 +335,7 @@ ov::Tensor pack_image_features_llava_next( std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const bool& is_video) { if (is_video) { - std::cout << "== Warning: LLaVANext doesn't support video process. " << std::endl; + Logger::warn("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images."); } std::vector embeds; diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 7e76e0adc2..bc7c7658fd 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -615,44 +615,45 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map, } // keep both implementations for comparison and testing, here is the cpp version -EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_cpp(const ov::Tensor& image, const ov::AnyMap& config_map) { +EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_cpp(const std::vector& images, const ov::AnyMap& config_map) { CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); ov::InferRequest& encoder = infer_request_guard.get(); ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - ov::Shape image_shape = image.get_shape(); - auto original_height = image_shape.at(1); - auto original_width = image_shape.at(2); - ImageSize target_image_size = qwen2_vl_utils::smart_resize( - original_height, - original_width, - config.patch_size * config.merge_size, - config.min_pixels, - config.max_pixels - ); + + OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); + if (images.size() > 1) + OPENVINO_ASSERT(config.temporal_patch_size == images.size(), "temporal_patch_size != images.size()"); - clip_image_u8 input_image = tensor_to_clip_image_u8(image); - clip_image_u8 resized_image; - bicubic_resize(input_image, resized_image, target_image_size.width, target_image_size.height); - - clip_ctx ctx; - std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean); - 
std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); - clip_image_f32 normalized_image = clip_image_preprocess(ctx, resized_image); - - ov::Tensor patches = clip_image_f32_to_tensor(normalized_image); - // For single patch tile it to match temporal_patch_size - if (patches.get_shape().at(0) == 1) { - auto orig_shape = patches.get_shape(); - ov::Tensor tiled_patches(patches.get_element_type(), - {config.temporal_patch_size, orig_shape.at(1), orig_shape.at(2), orig_shape.at(3)}); - for (size_t i = 0; i < config.temporal_patch_size; i++) { - std::memcpy(tiled_patches.data() + i * patches.get_byte_size() / sizeof(float), - patches.data(), - patches.get_byte_size()); - - } - patches = std::move(tiled_patches); + ov::Shape orig_shape = images[0].get_shape(); + ImageSize target_image_size = qwen2_vl_utils::smart_resize(orig_shape.at(1), + orig_shape.at(2), + config.patch_size * config.merge_size, + config.min_pixels, + config.max_pixels); + + ov::Tensor tiled_patches(ov::element::f32, + {config.temporal_patch_size, 3, target_image_size.height, target_image_size.width}); + + for (size_t i = 0; i < config.temporal_patch_size; i++) { + const auto& image = images.size() > i ? images[i] : images[0]; + + clip_image_u8 input_image = tensor_to_clip_image_u8(image); + clip_image_u8 resized_image; + bicubic_resize(input_image, resized_image, target_image_size.width, target_image_size.height); + + clip_ctx ctx; + std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean); + std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); + clip_image_f32 normalized_image = clip_image_preprocess(ctx, resized_image); + + auto patch = clip_image_f32_to_tensor(normalized_image); + + std::memcpy(tiled_patches.data() + i * patch.get_byte_size() / sizeof(float), + patch.data(), + patch.get_byte_size()); } + auto patches = std::move(tiled_patches); + auto patches_shape = patches.get_shape(); size_t channel = patches_shape.at(1); size_t grid_t = patches_shape.at(0) / config.temporal_patch_size; @@ -758,7 +759,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { if (use_ov_image_preprocess == false) { - return encode_with_imagepreprocess_cpp(image, config_map); + return encode_with_imagepreprocess_cpp({image}, config_map); } return encode_with_imagepreprocess_ov({image}, config_map); } @@ -771,22 +772,25 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size), + config_map); + } else { + encoded_img = encode_with_imagepreprocess_ov( + std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size), + config_map); } - encoded_img = encode_with_imagepreprocess_ov( - std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size), - config_map); + encoded_imgs.push_back(encoded_img); } for (; i < images.size(); i++) { EncodedImage encoded_img; if (use_ov_image_preprocess == false) { - // return encode_with_imagepreprocess_cpp(image, config_map); - std::cout << "Warning: Not implemented. 
fallback to encode_with_imagepreprocess_ov" << std::endl; + encoded_img = encode_with_imagepreprocess_cpp({images[i]}, config_map); + } else { + encoded_img = encode_with_imagepreprocess_ov({images[i]}, config_map); } - - encoded_img = encode_with_imagepreprocess_ov({images[i]}, config_map); + encoded_imgs.push_back(encoded_img); } return encoded_imgs; diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index d5a082e489..87fa1588c8 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -21,7 +21,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; private: - EncodedImage encode_with_imagepreprocess_cpp(const ov::Tensor& image, const ov::AnyMap& config_map); + EncodedImage encode_with_imagepreprocess_cpp(const std::vector& image, const ov::AnyMap& config_map); EncodedImage encode_with_imagepreprocess_ov(const std::vector& image, const ov::AnyMap& config_map); bool use_ov_image_preprocess = true; // default use ov image preprocess, control by env IMAGE_PREPROCESS=CPP to use cpp image preprocess }; diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 4c4fc77b7e..46907b82b0 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -5,11 +5,13 @@ #include #include "openvino/runtime/infer_request.hpp" +#include "logger.hpp" #include "openvino/genai/common_types.hpp" #include "visual_language/vlm_config.hpp" #include "visual_language/processor_config.hpp" #include "circular_buffer_queue.hpp" + namespace ov::genai { /// @brief A pair describing image size. struct ImageSize { From 3df267f90b4b43c4d7fdea53abc411d585d4d35b Mon Sep 17 00:00:00 2001 From: xipingya Date: Mon, 15 Sep 2025 16:43:55 +0800 Subject: [PATCH 014/118] Pass same_images Signed-off-by: xipingya --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index bc7c7658fd..3dcebc9315 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -683,6 +683,8 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_cpp(const std::ve } // keep both implementations for comparison and testing, here is the ov version +// input multiple images, process based on video. +// input single image, process based on image. EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vector& images, const ov::AnyMap& config_map) { CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); ov::InferRequest& encoder = infer_request_guard.get(); @@ -701,6 +703,8 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec ); OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); + + ov::Tensor same_image(ov::element::f32, ov::Shape{1}, std::vector{images.size() == 2 ? 
1 : 0}.data());
 ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data());
 ov::Tensor input_image_2(ov::element::u8,
 image_shape,
@@ -738,6 +742,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec
 ov::Tensor reshape_shape4d(ov::element::i64, ov::Shape{4}, a_temp_shape4d);
 ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape);

+ encoder.set_tensor("same_image", same_image);
 encoder.set_tensor("raw_images_1", input_image_1);
 encoder.set_tensor("raw_images_2", input_image_2);
 encoder.set_tensor("resize_shape", target_shape);
From bf3169b48f730ffc7d3f5cdfc533dc5536237318 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Mon, 15 Sep 2025 16:54:03 +0800
Subject: [PATCH 015/118] add comments for same image

---
 src/cpp/src/visual_language/qwen2vl/classes.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp
index 3dcebc9315..057f5d131a 100644
--- a/src/cpp/src/visual_language/qwen2vl/classes.cpp
+++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp
@@ -743,6 +743,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec
 ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape);

 encoder.set_tensor("same_image", same_image);
+ // Same image means just duplicating input_image_1 as input_image_2.
 encoder.set_tensor("raw_images_1", input_image_1);
 encoder.set_tensor("raw_images_2", input_image_2);
 encoder.set_tensor("resize_shape", target_shape);
From e1250aa8019297dcc73547f8861c037824d55749 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Tue, 16 Sep 2025 09:48:06 +0800
Subject: [PATCH 016/118] Update loop condition, and rename variables.

Signed-off-by: xipingya 
---
 src/cpp/src/visual_language/inputs_embedder.cpp | 6 +-----
 src/cpp/src/visual_language/qwen2vl/classes.cpp | 13 +++++++------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index b4af9186c4..b6026404ac 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -169,11 +169,7 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_ima
 std::vector embeds;
 if (is_video) {
- embeds = m_vision_encoder->encode_video(single_images);
- if (!embeds.empty()) {
- return embeds;
- }
- // Fallback to image process.
+ return m_vision_encoder->encode_video(single_images); } for (const ov::Tensor& image : single_images) { diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 057f5d131a..002058cb5c 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -725,7 +725,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec if (patches_shape.at(0) == 1) { repeats = config.temporal_patch_size; } - uint64_t a_broadcast_shape[4] = {static_cast(repeats), 1, 1, 1}; + uint64_t a_tile_shape[4] = {static_cast(repeats), 1, 1, 1}; uint64_t a_temp_shape8d[8] = { grid_t, temporal_patch_size * channel, grid_h / config.merge_size, config.merge_size, config.patch_size, grid_w / config.merge_size, config.merge_size, config.patch_size @@ -737,13 +737,13 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec config.patch_size * config.patch_size }; uint64_t last_output_shape[2] = {grid_t * grid_h * grid_w, channel * temporal_patch_size * config.patch_size * config.patch_size}; - ov::Tensor tile_shape(ov::element::i64, ov::Shape{4}, a_broadcast_shape); + ov::Tensor tile_shape(ov::element::i64, ov::Shape{4}, a_tile_shape); ov::Tensor reshape_shape8d(ov::element::i64, ov::Shape{8}, a_temp_shape8d); ov::Tensor reshape_shape4d(ov::element::i64, ov::Shape{4}, a_temp_shape4d); ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape); + // Same image means just duplicating input_image_1 as input_image_2 or not. encoder.set_tensor("same_image", same_image); - // Same image means just duplicating input_image_1 as input_image_2. encoder.set_tensor("raw_images_1", input_image_1); encoder.set_tensor("raw_images_2", input_image_2); encoder.set_tensor("resize_shape", target_shape); @@ -774,8 +774,9 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector encoded_imgs; - size_t i = 0; - for (; i < images.size(); i += config.temporal_patch_size) { + int i = 0; + int image_num = static_cast(images.size()); + for (; i < image_num - static_cast(config.temporal_patch_size); i += config.temporal_patch_size) { EncodedImage encoded_img; if (use_ov_image_preprocess == false) { encoded_img = encode_with_imagepreprocess_cpp( @@ -789,7 +790,7 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector Date: Tue, 16 Sep 2025 09:56:59 +0800 Subject: [PATCH 017/118] Update src/cpp/src/visual_language/pipeline_base.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/pipeline_base.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 06e8ee1914..7f87672ae4 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -35,8 +35,8 @@ class ov::genai::VLMPipeline::VLMPipelineBase { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); auto video = config_map.find(ov::genai::video.name()); - OPENVINO_ASSERT(config_map.end() == image || config_map.end() == images || config_map.end() == video, - "Only one property can be set: image, images, or video."); + int num_set = (config_map.end() != image) + (config_map.end() != images) + (config_map.end() != video); + OPENVINO_ASSERT(num_set == 1, "Only one property can be set: image, images, or video."); std::vector 
image_rgbs;
 if (config_map.end() != image) {
 image_rgbs = {image->second.as()};
From dec67b2ed790fcea69f23f139dc36c8bbd8992b7 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Tue, 16 Sep 2025 13:47:57 +0800
Subject: [PATCH 018/118] video should be frames.

Signed-off-by: xipingya 
---
 src/python/openvino_genai/py_openvino_genai.pyi | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index f2dfbffab2..599d16824a 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -3474,7 +3474,7 @@ class VLMPipeline:
 :param images: image or list of images
 :type images: list[ov.Tensor] or ov.Tensor

- :param video: list of images
+ :param video: list of frames
 :type video: list[ov.Tensor]

 :param generation_config: generation_config
From caee3fdf8a5d282abd8285fbf91f39124c1a9ec6 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Tue, 16 Sep 2025 15:46:01 +0800
Subject: [PATCH 019/118] Add pytest for video input.

Signed-off-by: xipingya 
---
 tests/python_tests/test_vlm_pipeline.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py
index cf33dfda8a..b57bdeaae8 100644
--- a/tests/python_tests/test_vlm_pipeline.py
+++ b/tests/python_tests/test_vlm_pipeline.py
@@ -974,3 +974,24 @@ def test_vlm_pipeline_match_optimum_preresized(request, model_id, image_name, ba
 genai_text = genai_output.texts[0]

 assert optimum_text == genai_text
+
+
+@pytest.mark.precommit
+@pytest.mark.parametrize(
+ "model_id, image_name, backend",
+ [
+ pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "cat_image_336x336", "SDPA"),
+ pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "cat_image_336x336", "PA"),
+ ],
+)
+def test_vlm_pipeline_video_input(request, model_id, image_name, backend):
+ resized_image = request.getfixturevalue(image_name)
+
+ prompt = "Describe this image."
+ max_new_tokens = 10
+
+ model_path = get_ov_model(model_id)
+
+ vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend)
+ with pytest.raises(Exception):
+ genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)], max_new_tokens=max_new_tokens)
\ No newline at end of file
From 1502b285535884895639521dbc36dc7fe703f2d5 Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Wed, 17 Sep 2025 13:25:40 +0800
Subject: [PATCH 020/118] Remove is_video python attribute. Add "video" to
 continuous batching.

Signed-off-by: xipingya 
---
 src/python/openvino_genai/py_openvino_genai.pyi | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi
index 599d16824a..ebfc197fa4 100644
--- a/src/python/openvino_genai/py_openvino_genai.pyi
+++ b/src/python/openvino_genai/py_openvino_genai.pyi
@@ -426,7 +426,7 @@ class ContinuousBatchingPipeline:
 def generate(self, prompt: str, generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]:
 ...
@typing.overload - def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: + def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], video: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: ... def get_config(self) -> GenerationConfig: ... @@ -837,12 +837,6 @@ class GenerationConfig: def max_new_tokens(self, arg0: typing.SupportsInt) -> None: ... @property - def is_video(self) -> bool: - ... - @is_video.setter - def is_video(self, arg0: typing.SupportsInt) -> None: - ... - @property def max_ngram_size(self) -> int: ... @max_ngram_size.setter From 4d8e867719fd94e9d7a5e61b08819793c13c55af Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 17 Sep 2025 13:44:49 +0800 Subject: [PATCH 021/118] rename video to videos --- src/python/openvino_genai/py_openvino_genai.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index ebfc197fa4..4a8c80c880 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -426,7 +426,7 @@ class ContinuousBatchingPipeline: def generate(self, prompt: str, generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: ... @typing.overload - def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], video: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: + def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], videos: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: ... def get_config(self) -> GenerationConfig: ... From ea7fc94e4b24fbebf1de4beefbf984fd0ef52e04 Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 17 Sep 2025 14:10:42 +0800 Subject: [PATCH 022/118] Update docs, and add video for add_request. 
Signed-off-by: xipingya --- src/python/openvino_genai/py_openvino_genai.pyi | 2 +- src/python/py_vlm_pipeline.cpp | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 4a8c80c880..b0a91f3e46 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -412,7 +412,7 @@ class ContinuousBatchingPipeline: def add_request(self, request_id: typing.SupportsInt, prompt: str, generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... def finish_chat(self) -> None: ... diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index a2e805823a..b254b89a98 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -46,6 +46,9 @@ auto vlm_generate_docstring = R"( :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor + :param video: list of frames + :type video: list[ov.Tensor] + :param generation_config: generation_config :type generation_config: GenerationConfig or a dict From 60364bfdb3adf261d774f6f1a72d914965c8b371 Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 17 Sep 2025 14:33:04 +0800 Subject: [PATCH 023/118] Fix docs format. Signed-off-by: xipingya --- src/python/openvino_genai/py_openvino_genai.pyi | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index b0a91f3e46..c1f63377f9 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -3467,10 +3467,10 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor - + :param video: list of frames :type video: list[ov.Tensor] - + :param generation_config: generation_config :type generation_config: GenerationConfig or a dict @@ -3510,6 +3510,9 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor + :param video: list of frames + :type video: list[ov.Tensor] + :param generation_config: generation_config :type generation_config: GenerationConfig or a dict From 4ea5b3d5602947b6d3bc4f3b521a16b7cdd78a90 Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 11:25:43 +0800 Subject: [PATCH 024/118] Fix test error: can't catch exception. 
Signed-off-by: xipingya --- tests/python_tests/test_vlm_pipeline.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index b57bdeaae8..a6892b7885 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -993,5 +993,4 @@ def test_vlm_pipeline_video_input(request, model_id, image_name, backend): model_path = get_ov_model(model_id) vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - with pytest.raises(Exception): - genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)], max_new_tokens=max_new_tokens) \ No newline at end of file + genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)], max_new_tokens=max_new_tokens) \ No newline at end of file From 8a0ab2e6ffdb430a139aa50ec5d1bac300e50485 Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 12:13:03 +0800 Subject: [PATCH 025/118] Fix: cannot be narrowed from type 'int' to 'float' in initializer list Signed-off-by: xipingya --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 002058cb5c..2399c515b2 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -704,7 +704,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); - ov::Tensor same_image(ov::element::f32, ov::Shape{1}, std::vector{images.size() == 2 ? 1 : 0}.data()); + ov::Tensor same_image(ov::element::f32, ov::Shape{1}, std::vector{images.size() == 2u ? 1.f : 0.f}.data()); ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); ov::Tensor input_image_2(ov::element::u8, image_shape, From 28337ea5d91bec71f9849037f17af15511bc7bbe Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 13:43:11 +0800 Subject: [PATCH 026/118] Support no image or video input; Signed-off-by: xipingya --- src/cpp/src/visual_language/pipeline_base.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 7f87672ae4..ad0b06aca8 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -36,7 +36,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { auto images = config_map.find(ov::genai::images.name()); auto video = config_map.find(ov::genai::video.name()); int num_set = (config_map.end() != image) + (config_map.end() != images) + (config_map.end() != video); - OPENVINO_ASSERT(num_set == 1, "Only one property can be set: image, images, or video."); + OPENVINO_ASSERT(num_set <= 1, "Only one property can be set: image, images, or video."); std::vector image_rgbs; if (config_map.end() != image) { image_rgbs = {image->second.as()}; From f3fd7d48f2c60cfcf190817f70891980b4f8af50 Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 16:04:11 +0800 Subject: [PATCH 027/118] Add checking input for python api. 
Signed-off-by: xipingya --- src/cpp/src/continuous_batching/pipeline_base.cpp | 11 +++++++++++ src/cpp/src/visual_language/pipeline.cpp | 2 ++ 2 files changed, 13 insertions(+) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 629356a95d..aa5891df1a 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -173,6 +173,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto& video = video_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); + + int input_check = (rgbs.size() == 0u) + (video.size() == 0u); + OPENVINO_ASSERT(input_check <= 1, "Only accept one input image, images, or video."); + if (rgbs.size() > 0) { encoded_images = m_inputs_embedder->encode_images(rgbs, false); } else if (video.size() > 0) { @@ -205,6 +209,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto& rgbs = rgbs_vector[i]; const auto& video = video_vector[i]; std::vector encoded_images; + + int input_check = (rgbs.size() == 0u) + (video.size() == 0u); + OPENVINO_ASSERT(input_check <= 1, "Only accept one input image, images, or video."); + if (rgbs.size() > 0) { encoded_images = m_inputs_embedder->encode_images(rgbs, false); } else if (video.size() > 0) { @@ -276,6 +284,9 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re const std::vector& video, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); + int input_check = (rgbs.size() == 0u) + (video.size() == 0u); + OPENVINO_ASSERT(input_check <= 1, "Only accept one input image, images, or video."); + ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 5f2b1f16da..3f288df47f 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -157,6 +157,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ GenerationConfig generation_config, const StreamerVariant& streamer ) override { + int input_check = (images.size() == 0u) + (video.size() == 0u); + OPENVINO_ASSERT(input_check <= 1, "Only accept one input image, images, or video."); auto generate_start_time = std::chrono::steady_clock::now(); VLMPerfMetrics perf_metrics; From a80d28e51a5844a435a8f267d9ec9b2b1d4ab6be Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 20:42:07 +0800 Subject: [PATCH 028/118] cpp interface: generate, remove video. 
add is_video, default false Signed-off-by: xipingya --- .../genai/continuous_batching_pipeline.hpp | 8 ++-- .../genai/visual_language/pipeline.hpp | 4 +- src/cpp/src/continuous_batching/pipeline.cpp | 12 +++--- .../src/continuous_batching/pipeline_base.cpp | 43 ++++--------------- .../src/continuous_batching/pipeline_base.hpp | 8 ++-- .../src/continuous_batching/pipeline_impl.cpp | 2 +- .../continuous_batching_adapter.hpp | 6 +-- .../visual_language/llava_next/classes.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 23 +++------- src/cpp/src/visual_language/pipeline_base.hpp | 22 ++++------ .../py_continuous_batching_pipeline.cpp | 32 +++++++++++--- src/python/py_vlm_pipeline.cpp | 27 +++++++++--- 12 files changed, 91 insertions(+), 98 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 6327e770cf..4aa99789ea 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -168,8 +168,8 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, - const ov::genai::GenerationConfig& sampling_params); + const ov::genai::GenerationConfig& sampling_params, + const bool& is_video = false); void step(); @@ -181,9 +181,9 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { std::vector generate( const std::vector& prompts, const std::vector>& images, - const std::vector>& videos, const std::vector& sampling_params, - const StreamerVariant& streamer=std::monostate{}); + const StreamerVariant& streamer=std::monostate{}, + const bool& is_video = false); /** * @brief start chat with keeping history in kv cache. * @param system_message optional system message. diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 5a8efcac2c..c22840b8b5 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -99,9 +99,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, const GenerationConfig& generation_config, - const StreamerVariant& streamer + const StreamerVariant& streamer, + const bool& is_video = false ); /// @brief Generate a response given a prompt and uint8 RGB image with [NHWC] or [HWC] layout. 
diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index 3b928ef1a8..bac55ff432 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -240,9 +240,9 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, co GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, - const ov::genai::GenerationConfig& sampling_params) { - return m_impl->add_request(request_id, prompt, images, video, sampling_params); + const ov::genai::GenerationConfig& sampling_params, + const bool& is_video) { + return m_impl->add_request(request_id, prompt, images, sampling_params, is_video); } void ContinuousBatchingPipeline::step() { @@ -276,10 +276,10 @@ std::vector ContinuousBatchingPipeline::generate(const std::ve std::vector ContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& images, - const std::vector>& videos, const std::vector& sampling_params, - const StreamerVariant& streamer) { - return m_impl->generate(prompts, images, videos, sampling_params, streamer); + const StreamerVariant& streamer, + const bool& is_video) { + return m_impl->generate(prompts, images, sampling_params, streamer, is_video); } diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index aa5891df1a..cf26ac9033 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -51,8 +51,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( // TODO: remove this code and within model runner add check: if sequence group type is tokens, // but embedding model is available => compute embeddings first, then pass to LLM std::vector> images(prompts.size()); - std::vector> videos(prompts.size()); - auto results_vlm = generate(prompts, images, videos, sampling_params, streamer); + auto results_vlm = generate(prompts, images, sampling_params, streamer, false); std::vector resutls; for (auto& vlm_result : results_vlm) { GenerationResult result; @@ -151,15 +150,14 @@ std::vector ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& rgbs_vector, - const std::vector>& video_vector, const std::vector& sampling_params, - const StreamerVariant& streamer) { + const StreamerVariant& streamer, + const bool& is_video) { auto generate_start_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS); OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors."); - OPENVINO_ASSERT(prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of video vectors."); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -170,18 +168,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( if (m_is_chat_conversation) { OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); const auto& rgbs = rgbs_vector[0]; - const auto& video = video_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - int input_check = (rgbs.size() == 0u) + (video.size() == 
0u); - OPENVINO_ASSERT(input_check >= 1, "Only accept one input image, images, or video."); - - if (rgbs.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(rgbs, false); - } else if (video.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(video, true); - } + encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -207,17 +197,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; const auto& rgbs = rgbs_vector[i]; - const auto& video = video_vector[i]; - std::vector encoded_images; - - int input_check = (rgbs.size() == 0u) + (video.size() == 0u); - OPENVINO_ASSERT(input_check >= 1, "Only accept one input image, images, or video."); - if (rgbs.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(rgbs, false); - } else if (video.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(video, true); - } + auto encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -281,24 +262,16 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, - const std::vector& video, - GenerationConfig sampling_params) { + GenerationConfig sampling_params, + const bool& is_video) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); - int input_check = (rgbs.size() == 0u) + (video.size() == 0u); - OPENVINO_ASSERT(input_check >= 1, "Only accept one input image, images, or video."); - ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - std::vector encoded_images; - if (rgbs.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(rgbs, false); - } else if (video.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(video, true); - } + auto encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 8350292667..a8760c5558 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -93,8 +93,8 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, - const std::vector& video, - GenerationConfig sampling_params); + GenerationConfig sampling_params, + const bool& is_video = false); /** * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop @@ -127,9 +133,9 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { generate( const std::vector& prompts, const std::vector>& rgbs, - const std::vector>& videos, const std::vector& sampling_params, - 
const StreamerVariant& streamer); + const StreamerVariant& streamer, + const bool& is_video = false); /** * Starts chat with a given system prompt diff --git a/src/cpp/src/continuous_batching/pipeline_impl.cpp b/src/cpp/src/continuous_batching/pipeline_impl.cpp index b0e460c41b..1410623160 100644 --- a/src/cpp/src/continuous_batching/pipeline_impl.cpp +++ b/src/cpp/src/continuous_batching/pipeline_impl.cpp @@ -304,7 +304,7 @@ ContinuousBatchingPipeline::ContinuousBatchingImpl::add_request(uint64_t request timer.end(); return add_request(request_id, inputs, sampling_params); } else if (m_model_input_type == ModelInputType::EMBEDDINGS) { - return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, {}, sampling_params); + return ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(request_id, prompt, {}, sampling_params); } else { OPENVINO_THROW("Unknown model input type."); } diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index 123a55cf51..6db3828a1c 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -44,12 +44,12 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, GenerationConfig generation_config, - const StreamerVariant& streamer + const StreamerVariant& streamer, + const bool& is_video ) override { auto start_time = std::chrono::steady_clock::now(); - auto result = m_impl.generate({prompt}, {images}, {video}, {generation_config}, streamer)[0]; + auto result = m_impl.generate({prompt}, {images}, {generation_config}, streamer, is_video)[0]; auto stop_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; diff --git a/src/cpp/src/visual_language/llava_next/classes.hpp b/src/cpp/src/visual_language/llava_next/classes.hpp index 8559853fd4..19293c0e78 100644 --- a/src/cpp/src/visual_language/llava_next/classes.hpp +++ b/src/cpp/src/visual_language/llava_next/classes.hpp @@ -24,7 +24,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const bool& is_video) override; + std::vector encode_images(const std::vector& images, const bool& is_video = false) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 3f288df47f..b52de37abb 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -153,13 +153,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, GenerationConfig generation_config, - const StreamerVariant& streamer + const StreamerVariant& streamer, + const bool& is_video ) override { - int input_check = (images.size() == 0u) + (video.size() == 0u); - OPENVINO_ASSERT(input_check >= 1, "Only accept one input image, images, or video."); - auto generate_start_time = std::chrono::steady_clock::now(); VLMPerfMetrics perf_metrics; auto& raw_counters = 
perf_metrics.raw_metrics; @@ -187,13 +184,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - // Currently only one input is supported. Video, images or image. - std::vector encoded_images; - if (images.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(images, false); - } else if (video.size() > 0) { - encoded_images = m_inputs_embedder->encode_images(video, true); - } + auto encoded_images = m_inputs_embedder->encode_images(images, is_video); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { @@ -461,11 +452,11 @@ VLMPipeline::~VLMPipeline() = default; VLMDecodedResults VLMPipeline::generate( const std::string& prompt, const std::vector& images, - const std::vector& video, const GenerationConfig& generation_config, - const StreamerVariant& streamer + const StreamerVariant& streamer, + const bool& is_video ) { - return m_pimpl->generate(prompt, images, video, generation_config, streamer); + return m_pimpl->generate(prompt, images, generation_config, streamer, is_video); } VLMDecodedResults VLMPipeline::generate( @@ -474,7 +465,7 @@ VLMDecodedResults VLMPipeline::generate( const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, {rgb}, {}, generation_config, streamer); + return m_pimpl->generate(prompt, {rgb}, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index ad0b06aca8..581e66fcb7 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -23,9 +23,9 @@ class ov::genai::VLMPipeline::VLMPipelineBase { virtual VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, GenerationConfig generation_config, - const StreamerVariant& streamer + const StreamerVariant& streamer, + const bool& is_video = false ) = 0; VLMDecodedResults generate( @@ -35,7 +35,8 @@ class ov::genai::VLMPipeline::VLMPipelineBase { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); auto video = config_map.find(ov::genai::video.name()); - int num_set = (config_map.end() != image) + (config_map.end() != images) + (config_map.end() != video); + bool is_video = config_map.end() != video; + int num_set = (config_map.end() != image) + (config_map.end() != images) + (is_video); OPENVINO_ASSERT(num_set <= 1, "Only one property can be set: image, images, or video."); std::vector image_rgbs; if (config_map.end() != image) { @@ -52,13 +53,12 @@ class ov::genai::VLMPipeline::VLMPipelineBase { } } - std::vector video_rgbs; - if (config_map.end() != video) { + if (is_video) { if (video->second.is>()) { - video_rgbs = video->second.as>(); + image_rgbs = video->second.as>(); } else if (video->second.is()){ - video_rgbs = {video->second.as()}; + image_rgbs = {video->second.as()}; } else { OPENVINO_THROW("Unknown video type."); @@ -69,13 +69,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); config.update_generation_config(config_map); - return generate( - prompt, - image_rgbs, - video_rgbs, - config, - utils::get_streamer_from_map(config_map) - ); + return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map), is_video); } virtual void start_chat(const std::string& system_message) = 0; diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 2da9a331a6..8060195d49 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -411,13 +411,13 @@ void init_continuous_batching_pipeline(py::module_& m) { py::overload_cast&, - const std::vector&, - const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), + const ov::genai::GenerationConfig&, + const bool&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("images"), - py::arg("video"), - py::arg("generation_config")) + py::arg("generation_config"), + py::arg("is_video") = false) .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) @@ -473,7 +473,6 @@ void init_continuous_batching_pipeline(py::module_& m) { [](ContinuousBatchingPipeline& pipe, const std::vector& prompts, const std::vector>& images, - const std::vector>& videos, const std::vector& generation_config, const pyutils::PyBindStreamerVariant& py_streamer ) -> py::typing::Union> { @@ -481,12 +480,33 @@ void init_continuous_batching_pipeline(py::module_& m) { std::vector generated_results; { py::gil_scoped_release rel; - generated_results = pipe.generate(prompts, images, videos, generation_config, streamer); + generated_results = pipe.generate(prompts, images, generation_config, streamer); } return py::cast(generated_results); }, py::arg("prompts"), py::arg("images"), + py::arg("generation_config"), + py::arg("streamer") = std::monostate{} + ) + + .def( + "generate", + [](ContinuousBatchingPipeline& pipe, + const std::vector& prompts, + const std::vector>& videos, + const std::vector& generation_config, + const pyutils::PyBindStreamerVariant& py_streamer + ) -> py::typing::Union> { + ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); + std::vector generated_results; + { + py::gil_scoped_release rel; + generated_results = pipe.generate(prompts, videos, generation_config, streamer, true); + } + return py::cast(generated_results); + }, + py::arg("prompts"), py::arg("videos"), py::arg("generation_config"), py::arg("streamer") = std::monostate{} diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index b254b89a98..2b31c37ec6 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -128,17 +128,17 @@ py::object call_vlm_generate( ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, - const std::vector& video, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& py_streamer, - const py::kwargs& kwargs + const py::kwargs& kwargs, + const bool& is_video = false ) { auto updated_config = *pyutils::update_config_from_kwargs(generation_config, kwargs); ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); ov::genai::VLMDecodedResults res; { py::gil_scoped_release rel; - res= pipe.generate(prompt, images, video, updated_config, streamer); + res= pipe.generate(prompt, images, 
updated_config, streamer, is_video); } return py::cast(res); } @@ -227,15 +227,30 @@ void init_vlm_pipeline(py::module_& m) { [](ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, - const std::vector& video, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, images, video, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", + py::arg("generation_config"), "generation_config", + py::arg("streamer") = std::monostate(), "streamer", + (vlm_generate_docstring + std::string(" \n ")).c_str() + ) + .def( + "generate", + [](ov::genai::VLMPipeline& pipe, + const std::string& prompt, + const std::vector& video, + const ov::genai::GenerationConfig& generation_config, + const pyutils::PyBindStreamerVariant& streamer, + const py::kwargs& kwargs + ) -> py::typing::Union { + return call_vlm_generate(pipe, prompt, video, generation_config, streamer, kwargs, true); + }, + py::arg("prompt"), "Input string", py::arg("video"), "Input video", py::arg("generation_config"), "generation_config", py::arg("streamer") = std::monostate(), "streamer", @@ -250,7 +265,7 @@ void init_vlm_pipeline(py::module_& m) { const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, {images}, {}, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, {images}, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", From 6ab0a355faa7615c0dc289a6372d5c50fffdbf93 Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 18 Sep 2025 20:54:24 +0800 Subject: [PATCH 029/118] update get_inputs_embeds_with_token_type_ids and get_inputs_embeds, if pass image, also need to pass video flag. 
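
A rough, free-standing Python sketch of the threading pattern this commit applies (encoder names are made up for illustration; the real methods are the C++ ones changed below): the modality flag is accepted at the outermost entry point and forwarded unchanged down to the encoder dispatch, so no call site needs to branch on it itself.

    # Stand-ins for the two encoder paths selected by the flag.
    def encode_images(tensors, is_video=False):
        if is_video:
            return [("video-frame-embedding", t) for t in tensors]
        return [("image-embedding", t) for t in tensors]

    # Mirrors get_inputs_embeds(prompt, images, is_video, metrics, ...):
    # the entry point does not branch, it only forwards the flag.
    def get_inputs_embeds(prompt, tensors, is_video=False):
        return (prompt, encode_images(tensors, is_video))

    print(get_inputs_embeds("describe", ["f0", "f1"], is_video=True))
    print(get_inputs_embeds("describe", ["f0", "f1"]))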
Signed-off-by: xipingya --- src/cpp/src/visual_language/inputs_embedder.cpp | 14 ++++++++------ src/cpp/src/visual_language/inputs_embedder.hpp | 8 ++++---- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b6026404ac..77b185d6ed 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -178,16 +178,17 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_ima return embeds; } -ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return get_inputs_embeds(prompt, encode_images(images), metrics, true, image_sequence); +ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { + return get_inputs_embeds(prompt, encode_images(images, is_video), metrics, true, image_sequence); } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( const std::string& prompt, const std::vector& images, + const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images), metrics, true, image_sequence); + return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images, is_video), metrics, true, image_sequence); } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -261,8 +262,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map, } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return m_impl->get_inputs_embeds(prompt, images, metrics, image_sequence); +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { + return m_impl->get_inputs_embeds(prompt, images, is_video, metrics, image_sequence); } ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector& image_sequence) { @@ -272,10 +273,11 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( const std::string& prompt, const std::vector& images, + const bool& is_video, VLMPerfMetrics& metrics, const std::vector& image_sequence) { return m_impl->get_inputs_embeds_with_token_type_ids( - prompt, images, metrics, image_sequence); + prompt, images, is_video, metrics, image_sequence); } std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 3d250c1539..7dd7ad8a69 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -35,12 +35,12 @@ class InputsEmbedder { const ov::AnyMap device_config); // compute input embedding for prompt and multiple images - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& 
metrics, const std::vector& image_sequence); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); // compute input embedding and token_type_ids - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); + std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const bool& is_video, VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); @@ -108,9 +108,9 @@ class InputsEmbedder { public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) = 0; - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); + std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); virtual std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); From dc30ec1d16de041ce40a2245b044ae7b6c5a68ba Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 19 Sep 2025 09:10:01 +0800 Subject: [PATCH 030/118] update pyi interface of generate. Signed-off-by: xipingya --- .../openvino_genai/py_openvino_genai.pyi | 51 +++++++++++++++++-- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index c1f63377f9..d28ddeadd2 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -412,7 +412,7 @@ class ContinuousBatchingPipeline: def add_request(self, request_id: typing.SupportsInt, prompt: str, generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, is_video: bool = False) -> GenerationHandle: ... def finish_chat(self) -> None: ... 
@@ -426,7 +426,10 @@ class ContinuousBatchingPipeline: def generate(self, prompt: str, generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: ... @typing.overload - def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], videos: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: + def generate(self, prompts: collections.abc.Sequence[str], images: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: + ... + @typing.overload + def generate(self, prompts: collections.abc.Sequence[str], videos: collections.abc.Sequence[collections.abc.Sequence[openvino._pyopenvino.Tensor]], generation_config: collections.abc.Sequence[GenerationConfig], streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None) -> list[GenerationResult]: ... def get_config(self) -> GenerationConfig: ... @@ -3442,7 +3445,46 @@ class VLMPipeline: def finish_chat(self) -> None: ... @typing.overload - def generate(self, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: + def generate(self, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: + """ + Generates sequences for VLMs. + + :param prompt: input prompt + :type prompt: str + The prompt can contain with i replaced with + an actual zero based index to refer to an image. Reference to + images used in previous prompts isn't implemented. + A model's native image tag can be used instead of + . These tags are: + InternVL2: \n + llava-1.5-7b-hf: + LLaVA-NeXT: + MiniCPM-V-2_6: (./)\n + Phi-3-vision: <|image_i|>\n - the index starts with one + Phi-4-multimodal-instruct: <|image_i|>\n - the index starts with one + Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|> + Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|> + gemma-3-4b-it: + If the prompt doesn't contain image tags, but images are + provided, the tags are prepended to the prompt. + + :param images: image or list of images + :type images: list[ov.Tensor] + + :param generation_config: generation_config + :type generation_config: GenerationConfig or a dict + + :param streamer: streamer either as a lambda with a boolean returning flag whether generation should be stopped + :type : Callable[[str], bool], ov.genai.StreamerBase + + :param kwargs: arbitrary keyword arguments with keys corresponding to GenerationConfig fields. 
+ :type : dict + + :return: return results in decoded form + :rtype: VLMDecodedResults + """ + @typing.overload + def generate(self, prompt: str, video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. From 5edf0a51a63896e79c6829d63867045d90b9a06e Mon Sep 17 00:00:00 2001 From: xipingya Date: Wed, 24 Sep 2025 20:11:57 +0800 Subject: [PATCH 031/118] Remove "const bool& is_video" in add_request and generate. Signed-off-by: xipingya --- .../genai/continuous_batching_pipeline.hpp | 20 ++- .../genai/visual_language/pipeline.hpp | 20 ++- src/cpp/src/continuous_batching/pipeline.cpp | 26 +++- .../src/continuous_batching/pipeline_base.cpp | 52 +++++-- .../src/continuous_batching/pipeline_base.hpp | 18 ++- .../continuous_batching_adapter.hpp | 15 +- src/cpp/src/visual_language/pipeline.cpp | 33 ++++- src/cpp/src/visual_language/pipeline_base.hpp | 29 ++-- .../py_continuous_batching_pipeline.cpp | 135 +++++++++++------- src/python/py_vlm_pipeline.cpp | 12 +- 10 files changed, 261 insertions(+), 99 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 4aa99789ea..726e76b80c 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -168,8 +168,13 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const ov::genai::GenerationConfig& sampling_params, - const bool& is_video = false); + const ov::genai::GenerationConfig& sampling_params); + + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + const ov::genai::GenerationConfig& sampling_params); void step(); @@ -182,8 +187,15 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { const std::vector& prompts, const std::vector>& images, const std::vector& sampling_params, - const StreamerVariant& streamer=std::monostate{}, - const bool& is_video = false); + const StreamerVariant& streamer=std::monostate{}); + + std::vector generate( + const std::vector& prompts, + const std::vector>& images, + const std::vector>& video, + const std::vector& sampling_params, + const StreamerVariant& streamer=std::monostate{}); + /** * @brief start chat with keeping history in kv cache. * @param system_message optional system message. diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index c22840b8b5..3d72a3973e 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -90,7 +90,23 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// uint8 RGB images with [NHWC] or [HWC] layout. /// @param prompt A prompt to respond to. /// @param images Images to be prepended to a prompt. + /// @param generation_config A config to follow for text generation. + /// @param streamer A streamer to acquire intermediate result. + /// @return A string generated by a model. + /// chat_template will be applied to the prompt, run pipe.set_chat_template(custom_chat_template) to update it. 
+ /// To disable it for non-chat mode, please, use custom_chat_template eq "" or set generation_config.apply_chat_template to false. + VLMDecodedResults generate( + const std::string& prompt, + const std::vector& images, + const GenerationConfig& generation_config, + const StreamerVariant& streamer + ); + + /// @brief Generate a response given a prompt and any number of + /// uint8 RGB images with [NHWC] or [HWC] layout. + /// @param prompt A prompt to respond to. /// @param video Video frames to be prepended to a prompt. + /// @param images Images to be prepended to a prompt. /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. @@ -99,9 +115,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { VLMDecodedResults generate( const std::string& prompt, const std::vector& images, + const std::vector& video, const GenerationConfig& generation_config, - const StreamerVariant& streamer, - const bool& is_video = false + const StreamerVariant& streamer ); /// @brief Generate a response given a prompt and uint8 RGB image with [NHWC] or [HWC] layout. diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index bac55ff432..5899d6d8ed 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -240,9 +240,16 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, co GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const ov::genai::GenerationConfig& sampling_params, - const bool& is_video) { - return m_impl->add_request(request_id, prompt, images, sampling_params, is_video); + const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, prompt, images, sampling_params); +} + +GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + const ov::genai::GenerationConfig& sampling_params) { + return m_impl->add_request(request_id, prompt, images, video, sampling_params); } void ContinuousBatchingPipeline::step() { @@ -277,11 +284,18 @@ std::vector ContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& images, const std::vector& sampling_params, - const StreamerVariant& streamer, - const bool& is_video) { - return m_impl->generate(prompts, images, sampling_params, streamer, is_video); + const StreamerVariant& streamer) { + return m_impl->generate(prompts, images, sampling_params, streamer); } +std::vector ContinuousBatchingPipeline::generate( + const std::vector& prompts, + const std::vector>& images, + const std::vector>& video, + const std::vector& sampling_params, + const StreamerVariant& streamer) { + return m_impl->generate(prompts, images, video, sampling_params, streamer); +} void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { m_impl->finish_chat(); diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index cf26ac9033..83662ae45d 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -51,7 +51,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( // TODO: remove this code and within model runner add check: if sequence group type is 
tokens, // but embedding model is available => compute embeddings first, then pass to LLM std::vector> images(prompts.size()); - auto results_vlm = generate(prompts, images, sampling_params, streamer, false); + auto results_vlm = generate(prompts, images, sampling_params, streamer); std::vector resutls; for (auto& vlm_result : results_vlm) { GenerationResult result; @@ -151,13 +151,23 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& rgbs_vector, const std::vector& sampling_params, - const StreamerVariant& streamer, - const bool& is_video) { + const StreamerVariant& streamer) { + return generate(prompts, rgbs_vector, {}, sampling_params, streamer); +} + +std::vector +ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( + const std::vector& prompts, + const std::vector>& rgbs_vector, + const std::vector>& video_vector, + const std::vector& sampling_params, + const StreamerVariant& streamer) { auto generate_start_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS); OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors."); + OPENVINO_ASSERT(rgbs_vector.empty() || video_vector.empty(), "Only support one input, video or images"); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -167,11 +177,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( if (m_is_chat_conversation) { OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); - const auto& rgbs = rgbs_vector[0]; + const auto& rgbs = video_vector.empty() ? rgbs_vector[0] : video_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); + encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty()); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -196,9 +206,9 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; - const auto& rgbs = rgbs_vector[i]; + const auto& rgbs = video_vector.empty() ? 
rgbs_vector[i] : video_vector[i]; - auto encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); + auto encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty()); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -262,16 +272,38 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, - GenerationConfig sampling_params, - const bool& is_video) { + GenerationConfig sampling_params) { + OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); + ov::genai::VLMPerfMetrics metrics; + ov::Tensor inputs; + { + std::lock_guard lock(m_embeddings_mutex); + m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); + + auto encoded_images = m_inputs_embedder->encode_images(rgbs, false); + + const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); + inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); + } + return add_request(request_id, inputs, sampling_params); +} + +GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request( + uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); + OPENVINO_ASSERT((video.empty() || images.empty()), "Only support one input, video or images."); ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = m_inputs_embedder->encode_images(rgbs, is_video); + auto encoded_images = video.empty() ? 
m_inputs_embedder->encode_images(images, false) + : m_inputs_embedder->encode_images(video, true); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index a8760c5558..941b382b58 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -93,8 +93,13 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& rgbs, - GenerationConfig sampling_params, - const bool& is_video = false); + GenerationConfig sampling_params); + + GenerationHandle add_request(uint64_t request_id, + const std::string& prompt, + const std::vector& images, + const std::vector& video, + GenerationConfig sampling_params); /** * Checks whether server (pipeline) has non-finished requests and step() should be called within a loop @@ -128,8 +133,13 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { const std::vector& prompts, const std::vector>& rgbs, const std::vector& sampling_params, - const StreamerVariant& streamer, - const bool& is_video = false); + const StreamerVariant& streamer); + + virtual std::vector generate(const std::vector& prompts, + const std::vector>& images, + const std::vector>& video, + const std::vector& sampling_params, + const StreamerVariant& streamer); /** * Starts chat with a given system prompt diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index 6db3828a1c..9a74d993f6 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -45,11 +45,20 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V const std::string& prompt, const std::vector& images, GenerationConfig generation_config, - const StreamerVariant& streamer, - const bool& is_video + const StreamerVariant& streamer ) override { + return generate(prompt, images, {}, generation_config, streamer); + } + + VLMDecodedResults generate( + const std::string& prompt, + const std::vector& images, + const std::vector& video, + GenerationConfig generation_config, + const StreamerVariant& streamer ) override { auto start_time = std::chrono::steady_clock::now(); - auto result = m_impl.generate({prompt}, {images}, {generation_config}, streamer, is_video)[0]; + auto result = m_impl.generate({prompt}, {images}, {video}, {generation_config}, streamer)[0]; auto stop_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index b52de37abb..4c196846df 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -154,9 +154,20 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ const std::string& prompt, const std::vector& images, GenerationConfig generation_config, - const StreamerVariant& streamer, - const bool& is_video + const StreamerVariant& streamer ) override { + return generate(prompt, images, {}, generation_config, streamer); + } + + VLMDecodedResults generate( + const std::string& prompt, + const std::vector& images, + const 
std::vector& video, + GenerationConfig generation_config, + const StreamerVariant& streamer + ) override { + OPENVINO_ASSERT((video.empty() || images.empty()), "Only support one input, video or images."); + auto generate_start_time = std::chrono::steady_clock::now(); VLMPerfMetrics perf_metrics; auto& raw_counters = perf_metrics.raw_metrics; @@ -184,7 +195,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - auto encoded_images = m_inputs_embedder->encode_images(images, is_video); + auto encoded_images = video.empty() ? m_inputs_embedder->encode_images(images, false) + : m_inputs_embedder->encode_images(video, true); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { @@ -452,11 +464,20 @@ VLMPipeline::~VLMPipeline() = default; VLMDecodedResults VLMPipeline::generate( const std::string& prompt, const std::vector& images, + const std::vector& video, const GenerationConfig& generation_config, - const StreamerVariant& streamer, - const bool& is_video + const StreamerVariant& streamer +) { + return m_pimpl->generate(prompt, images, video, generation_config, streamer); +} + +VLMDecodedResults VLMPipeline::generate( + const std::string& prompt, + const std::vector& images, + const GenerationConfig& generation_config, + const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, images, generation_config, streamer, is_video); + return m_pimpl->generate(prompt, images, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 581e66fcb7..a0497f54ec 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -24,8 +24,15 @@ class ov::genai::VLMPipeline::VLMPipelineBase { const std::string& prompt, const std::vector& images, GenerationConfig generation_config, - const StreamerVariant& streamer, - const bool& is_video = false + const StreamerVariant& streamer + ) = 0; + + virtual VLMDecodedResults generate( + const std::string& prompt, + const std::vector& images, + const std::vector& video, + GenerationConfig generation_config, + const StreamerVariant& streamer ) = 0; VLMDecodedResults generate( @@ -38,10 +45,18 @@ class ov::genai::VLMPipeline::VLMPipelineBase { bool is_video = config_map.end() != video; int num_set = (config_map.end() != image) + (config_map.end() != images) + (is_video); OPENVINO_ASSERT(num_set <= 1, "Only one property can be set: image, images, or video."); + + ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); + GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); + config.update_generation_config(config_map); + std::vector image_rgbs; if (config_map.end() != image) { image_rgbs = {image->second.as()}; - } if (config_map.end() != images) { + return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map)); + } + + if (config_map.end() != images) { if (images->second.is>()) { image_rgbs = images->second.as>(); } @@ -51,6 +66,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { else { OPENVINO_THROW("Unknown images type."); } + return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map)); } if (is_video) { @@ -63,13 +79,10 @@ class ov::genai::VLMPipeline::VLMPipelineBase { else { OPENVINO_THROW("Unknown video type."); } + return generate(prompt, {}, image_rgbs, config, utils::get_streamer_from_map(config_map)); } - ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); - GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); - config.update_generation_config(config_map); - - return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map), is_video); + return generate(prompt, image_rgbs, {}, config, utils::get_streamer_from_map(config_map)); } virtual void start_chat(const std::string& system_message) = 0; diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 8060195d49..36dea1dd0d 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -379,45 +379,85 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage) .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); - py::class_(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batchig") - .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map& llm_plugin_config, - const std::map& tokenizer_plugin_config, const std::map& inputs_embedder_plugin_config) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(models_path, scheduler_config, device, pyutils::properties_to_any_map(llm_plugin_config), - pyutils::properties_to_any_map(tokenizer_plugin_config), pyutils::properties_to_any_map(inputs_embedder_plugin_config)); - }), - py::arg("models_path"), - py::arg("scheduler_config"), - py::arg("device"), - py::arg("properties") = ov::AnyMap({}), - py::arg("tokenizer_properties") = ov::AnyMap({}), - py::arg("vision_encoder_properties") = ov::AnyMap({})) - - .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) { - ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs)); - }), - py::arg("models_path"), - py::arg("tokenizer"), - py::arg("scheduler_config"), - py::arg("device")) + py::class_(m, + "ContinuousBatchingPipeline", + "This class is used for generation with LLMs with continuous batching") + .def(py::init([](const std::filesystem::path& models_path, + const SchedulerConfig& scheduler_config, + const std::string& device, + const std::map& llm_plugin_config, + const std::map& tokenizer_plugin_config, + const std::map& 
inputs_embedder_plugin_config) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique( + models_path, + scheduler_config, + device, + pyutils::properties_to_any_map(llm_plugin_config), + pyutils::properties_to_any_map(tokenizer_plugin_config), + pyutils::properties_to_any_map(inputs_embedder_plugin_config)); + }), + py::arg("models_path"), + py::arg("scheduler_config"), + py::arg("device"), + py::arg("properties") = ov::AnyMap({}), + py::arg("tokenizer_properties") = ov::AnyMap({}), + py::arg("vision_encoder_properties") = ov::AnyMap({})) + + .def(py::init([](const std::filesystem::path& models_path, + const ov::genai::Tokenizer& tokenizer, + const SchedulerConfig& scheduler_config, + const std::string& device, + const py::kwargs& kwargs) { + ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); + return std::make_unique(models_path, + tokenizer, + scheduler_config, + device, + pyutils::kwargs_to_any_map(kwargs)); + }), + py::arg("models_path"), + py::arg("tokenizer"), + py::arg("scheduler_config"), + py::arg("device")) .def("get_tokenizer", &ContinuousBatchingPipeline::get_tokenizer) .def("get_config", &ContinuousBatchingPipeline::get_config) .def("get_metrics", &ContinuousBatchingPipeline::get_metrics) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("input_ids"), py::arg("generation_config")) - .def("add_request", py::overload_cast(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("generation_config")) + .def("add_request", + py::overload_cast( + &ContinuousBatchingPipeline::add_request), + py::arg("request_id"), + py::arg("input_ids"), + py::arg("generation_config")) + .def("add_request", + py::overload_cast( + &ContinuousBatchingPipeline::add_request), + py::arg("request_id"), + py::arg("prompt"), + py::arg("generation_config")) + .def("add_request", + py::overload_cast&, + const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), + py::arg("request_id"), + py::arg("prompt"), + py::arg("images"), + py::arg("generation_config")) + .def("add_request", py::overload_cast&, - const ov::genai::GenerationConfig&, - const bool&>(&ContinuousBatchingPipeline::add_request), + const std::vector&, + const ov::genai::GenerationConfig&>(&ContinuousBatchingPipeline::add_request), py::arg("request_id"), py::arg("prompt"), py::arg("images"), - py::arg("generation_config"), - py::arg("is_video") = false) + py::arg("video"), + py::arg("generation_config")) + .def("step", &ContinuousBatchingPipeline::step) .def("has_non_finished_requests", &ContinuousBatchingPipeline::has_non_finished_requests) @@ -429,44 +469,41 @@ void init_continuous_batching_pipeline(py::module_& m) { [](ContinuousBatchingPipeline& pipe, const std::vector& input_ids, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& streamer - ) -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer) + -> py::typing::Union> { return __call_cb_generate(pipe, input_ids, generation_config, streamer); }, py::arg("input_ids"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{} - ) + py::arg("streamer") = std::monostate{}) .def( "generate", [](ContinuousBatchingPipeline& pipe, const std::vector& prompts, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& streamer - ) -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer) + -> py::typing::Union> { return __call_cb_generate(pipe, 
prompts, generation_config, streamer); }, py::arg("prompts"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{} - ) - + py::arg("streamer") = std::monostate{}) + .def( "generate", [](ContinuousBatchingPipeline& pipe, const std::string& prompt, const ov::genai::GenerationConfig& generation_config, - const pyutils::PyBindStreamerVariant& streamer - ) -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer) + -> py::typing::Union> { std::vector prompts = { prompts }; std::vector generation_configs = { generation_config }; return __call_cb_generate(pipe, prompts, generation_configs, streamer); }, py::arg("prompt"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{} - ) + py::arg("streamer") = std::monostate{}) .def( "generate", @@ -474,8 +511,8 @@ void init_continuous_batching_pipeline(py::module_& m) { const std::vector& prompts, const std::vector>& images, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& py_streamer - ) -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& py_streamer) + -> py::typing::Union> { ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); std::vector generated_results; { @@ -487,8 +524,7 @@ void init_continuous_batching_pipeline(py::module_& m) { py::arg("prompts"), py::arg("images"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{} - ) + py::arg("streamer") = std::monostate{}) .def( "generate", @@ -496,19 +532,18 @@ void init_continuous_batching_pipeline(py::module_& m) { const std::vector& prompts, const std::vector>& videos, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& py_streamer - ) -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& py_streamer) + -> py::typing::Union> { ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); std::vector generated_results; { py::gil_scoped_release rel; - generated_results = pipe.generate(prompts, videos, generation_config, streamer, true); + generated_results = pipe.generate(prompts, {}, videos, generation_config, streamer); } return py::cast(generated_results); }, py::arg("prompts"), py::arg("videos"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{} - ); + py::arg("streamer") = std::monostate{}); } diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 2b31c37ec6..4d4aa79839 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -128,17 +128,17 @@ py::object call_vlm_generate( ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, + const std::vector& video, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& py_streamer, - const py::kwargs& kwargs, - const bool& is_video = false + const py::kwargs& kwargs ) { auto updated_config = *pyutils::update_config_from_kwargs(generation_config, kwargs); ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); ov::genai::VLMDecodedResults res; { py::gil_scoped_release rel; - res= pipe.generate(prompt, images, updated_config, streamer, is_video); + res= pipe.generate(prompt, images, video, updated_config, streamer); } return py::cast(res); } @@ -231,7 +231,7 @@ void init_vlm_pipeline(py::module_& m) { const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, images, generation_config, streamer, kwargs); + return 
call_vlm_generate(pipe, prompt, images, {}, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", @@ -248,7 +248,7 @@ void init_vlm_pipeline(py::module_& m) { const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, video, generation_config, streamer, kwargs, true); + return call_vlm_generate(pipe, prompt, {}, video, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("video"), "Input video", @@ -265,7 +265,7 @@ void init_vlm_pipeline(py::module_& m) { const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, {images}, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, {images}, {}, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", py::arg("images"), "Input images", From 2215f8a6bdc2890d9636137199ef875e20913ef8 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Thu, 25 Sep 2025 08:42:51 +0800 Subject: [PATCH 032/118] Update src/cpp/src/visual_language/qwen2vl/classes.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 2399c515b2..e5057abd3f 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -704,7 +704,8 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); - ov::Tensor same_image(ov::element::f32, ov::Shape{1}, std::vector{images.size() == 2u ? 1.f : 0.f}.data()); + std::vector same_image_data{images.size() == 2u ? 1.f : 0.f}; + ov::Tensor same_image(ov::element::f32, ov::Shape{1}, same_image_data.data()); ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); ov::Tensor input_image_2(ov::element::u8, image_shape, From 14352a7761acc6da473830548d8a6385a3dce9da Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Thu, 25 Sep 2025 08:45:51 +0800 Subject: [PATCH 033/118] Update src/python/openvino_genai/py_openvino_genai.pyi Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/python/openvino_genai/py_openvino_genai.pyi | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index d28ddeadd2..3a05ac4229 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -412,7 +412,10 @@ class ContinuousBatchingPipeline: def add_request(self, request_id: typing.SupportsInt, prompt: str, generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, is_video: bool = False) -> GenerationHandle: + def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: + ... 
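The overload set is easiest to see from the caller's side. Below is a minimal sketch of driving the bindings from Python, using the combined `(images, video)` form that the C++ bindings above expose and that patch 034 below settles on for the stubs; `models_path`, `image_tensor`, and `video_tensor` are placeholders, not values from this patch series:

```python
import openvino_genai as ov_genai

# Placeholders: models_path points at an exported VLM; image_tensor and
# video_tensor are openvino.Tensor inputs prepared by the caller.
pipe = ov_genai.ContinuousBatchingPipeline(models_path, ov_genai.SchedulerConfig(), "CPU")
config = ov_genai.GenerationConfig()
config.max_new_tokens = 32

# Image-only request: each tensor in `images` is encoded as a standalone image.
h0 = pipe.add_request(0, "Describe the image.", [image_tensor], config)

# Combined overload: images and video are passed as separate sequences, so a
# request can mix both; here only video frames are supplied.
h1 = pipe.add_request(1, "Describe the clip.", [], [video_tensor], config)

while pipe.has_non_finished_requests():
    pipe.step()
```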
+ @typing.overload + def add_request(self, request_id: typing.SupportsInt, prompt: str, video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... def finish_chat(self) -> None: ... From 89afa540a0b1e5abf6767e42e28f9b104d73cb3a Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 25 Sep 2025 08:56:40 +0800 Subject: [PATCH 034/118] copilot give a wrong suggestion. add images and video param for add_request. Signed-off-by: xipingya --- src/python/openvino_genai/py_openvino_genai.pyi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 3a05ac4229..6e093e2443 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -415,7 +415,7 @@ class ContinuousBatchingPipeline: def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... def finish_chat(self) -> None: ... From 8768795b35a36c58790ea137e60503201930a2a6 Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 25 Sep 2025 10:38:54 +0800 Subject: [PATCH 035/118] Add examples to .md Signed-off-by: xipingya --- README.md | 13 +++++++++++++ .../openvino/genai/visual_language/pipeline.hpp | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ccc9429715..313b12709e 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,13 @@ image_data = ov.Tensor(image_data) prompt = "Can you describe the image?" result = pipe.generate(prompt, image=image_data, max_new_tokens=100) + +# To input multiple images, use 'images=' +# result = pipe.generate(prompt, images=[image_data], max_new_tokens=100) + +# To input video frames, use 'video=' +# result = pipe.generate(prompt, video=[image_data], max_new_tokens=100) + print(result.texts[0]) ``` @@ -181,6 +188,12 @@ int main(int argc, char* argv[]) { ov::genai::image(rgb), ov::genai::max_new_tokens(100) ) << '\n'; + + // To input multiple images, use 'images' + // pipe.generate(prompt, ov::genai::images(std::vector{rgb}), ov::genai::max_new_tokens(100)); + + // To input video frames, use 'video' + // pipe.generate(prompt, ov::genai::video(std::vector{rgb}), ov::genai::max_new_tokens(100)); } ``` diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 3d72a3973e..6b07929494 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -105,8 +105,8 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @brief Generate a response given a prompt and any number of /// uint8 RGB images with [NHWC] or [HWC] layout. /// @param prompt A prompt to respond to. - /// @param video Video frames to be prepended to a prompt. /// @param images Images to be prepended to a prompt. 
+ /// @param video Video frames to be prepended to a prompt. /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. From be57bf265872350a391ee3266b964c9de23980be Mon Sep 17 00:00:00 2001 From: xipingya Date: Thu, 25 Sep 2025 14:42:02 +0800 Subject: [PATCH 036/118] Fix test video error, and input multiple images. Signed-off-by: xipingya --- src/cpp/src/continuous_batching/pipeline_base.cpp | 10 +++++----- .../visual_language/continuous_batching_adapter.hpp | 6 ++++-- src/cpp/src/visual_language/pipeline.cpp | 6 +++--- tests/python_tests/test_vlm_pipeline.py | 4 ++-- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index cbf633beb1..5c416388b4 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -166,8 +166,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS); OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); - OPENVINO_ASSERT(prompts.size() == rgbs_vector.size(), "Number of prompts should be equal to the number of images vectors."); - OPENVINO_ASSERT(rgbs_vector.empty() || video_vector.empty(), "Only support one input, video or images"); + OPENVINO_ASSERT(prompts.size() == rgbs_vector.size() || prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of images/video vectors."); + OPENVINO_ASSERT(rgbs_vector.size() == 0u || video_vector.size() == 0u, "Only support one input, video or images"); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -295,15 +295,15 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re const std::vector& video, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); - OPENVINO_ASSERT((video.empty() || images.empty()), "Only support one input, video or images."); + OPENVINO_ASSERT((video.size() == 0u || images.size() == 0u), "Only support one input, video or images."); ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = video.empty() ? m_inputs_embedder->encode_images(images, false) - : m_inputs_embedder->encode_images(video, true); + auto encoded_images = video.size() == 0 ? 
m_inputs_embedder->encode_images(images, false) + : m_inputs_embedder->encode_images(video, true); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index 9a74d993f6..1d3653be70 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -47,7 +47,7 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V GenerationConfig generation_config, const StreamerVariant& streamer ) override { - return generate(prompt, {}, images, generation_config, streamer); + return generate(prompt, images, {}, generation_config, streamer); } VLMDecodedResults generate( @@ -58,7 +58,9 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V const StreamerVariant& streamer ) override { auto start_time = std::chrono::steady_clock::now(); - auto result = m_impl.generate({prompt}, {images}, {video}, {generation_config}, streamer)[0]; + auto images_vec = images.size() == 0u ? std::vector>{} : std::vector>{images}; + auto video_vec = video.size() == 0u ? std::vector>{} : std::vector>{video}; + auto result = m_impl.generate({prompt}, images_vec, video_vec, {generation_config}, streamer)[0]; auto stop_time = std::chrono::steady_clock::now(); VLMDecodedResults decoded; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 4c196846df..b769a5b714 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -166,7 +166,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ GenerationConfig generation_config, const StreamerVariant& streamer ) override { - OPENVINO_ASSERT((video.empty() || images.empty()), "Only support one input, video or images."); + OPENVINO_ASSERT((video.size() == 0 || images.size() == 0u), "Only support one input, video or images."); auto generate_start_time = std::chrono::steady_clock::now(); VLMPerfMetrics perf_metrics; @@ -195,8 +195,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - auto encoded_images = video.empty() ? m_inputs_embedder->encode_images(images, false) - : m_inputs_embedder->encode_images(video, true); + auto encoded_images = video.size() == 0u ? m_inputs_embedder->encode_images(images, false) + : m_inputs_embedder->encode_images(video, true); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index a6892b7885..7364676c34 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -987,10 +987,10 @@ def test_vlm_pipeline_match_optimum_preresized(request, model_id, image_name, ba def test_vlm_pipeline_video_input(request, model_id, image_name, backend): resized_image = request.getfixturevalue(image_name) - prompt = "Describe this image." + prompt = "Describe this video." 
max_new_tokens = 10 model_path = get_ov_model(model_id) vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)], max_new_tokens=max_new_tokens) \ No newline at end of file + genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)]*3, max_new_tokens=max_new_tokens) \ No newline at end of file From d96c5dd2e9b417ca0f78e556f2b55a5438348105 Mon Sep 17 00:00:00 2001 From: xipingya Date: Fri, 26 Sep 2025 11:03:48 +0800 Subject: [PATCH 037/118] Update test based on 4D video. Signed-off-by: xipingya --- tests/python_tests/test_vlm_pipeline.py | 41 ++++++++++++++++++++----- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 7364676c34..501423cb3e 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -34,6 +34,8 @@ import sys import os import transformers +import numpy as np +import cv2 from optimum.intel.openvino import OVModelForVisualCausalLM from openvino_genai import ( VLMPipeline, @@ -923,6 +925,32 @@ def cat_image_336x336(cat_image): def cat_image_32x32(cat_image): return cat_image.resize((32, 32)) +# Return video with shape: [num_frames, height, width, 3] +def create_countdown_frames(): + frames_count = 5 + height = 240 + width = 360 + frame_list = [] + for count in range(frames_count, 0, -1): + frame = np.zeros((height, width, 3), dtype=np.uint8) + text = str(count) + (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 3, 4) + + text_x = (width - text_width) // 2 + text_y = (height + text_height) // 2 + + cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, + 3, (255, 255, 255), 4, cv2.LINE_AA + ) + + frame_list.append(frame) + ov_tensor = openvino.Tensor(np.stack(frame_list)) + return ov_tensor + +@pytest.fixture(scope="module") +def countdown_video(): + return create_countdown_frames() + @pytest.mark.precommit @pytest.mark.parametrize( @@ -978,19 +1006,18 @@ def test_vlm_pipeline_match_optimum_preresized(request, model_id, image_name, ba @pytest.mark.precommit @pytest.mark.parametrize( - "model_id, image_name, backend", + "model_id, video_name, backend", [ - pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "cat_image_336x336", "SDPA"), - pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "cat_image_336x336", "PA"), + pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "SDPA"), + pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA"), ], ) -def test_vlm_pipeline_video_input(request, model_id, image_name, backend): - resized_image = request.getfixturevalue(image_name) - +def test_vlm_pipeline_video_input(request, model_id, video_name, backend): + video_tensor = request.getfixturevalue(video_name) prompt = "Describe this video." 
max_new_tokens = 10 model_path = get_ov_model(model_id) vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - genai_output = vlm.generate(prompt, video=[openvino.Tensor(resized_image)]*3, max_new_tokens=max_new_tokens) \ No newline at end of file + genai_output = vlm.generate(prompt, video=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file From aaf20b08990605d0bca066336cd52771eeaf9d0c Mon Sep 17 00:00:00 2001 From: xipingya Date: Sat, 27 Sep 2025 09:52:48 +0800 Subject: [PATCH 038/118] Add vlm test dependency: opencv-python Signed-off-by: xipingya --- tests/python_tests/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index b262a4fc40..6452149bbb 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -8,6 +8,9 @@ transformers==4.53.3 hf_transfer==0.1.9 gguf==0.17.1 +# VLM requirements +opencv-python + # rag requirements langchain_community==0.3.29 langchain-core==0.3.75 From 6f5189b3257f2f43826b02ae42dfe46ee2982b80 Mon Sep 17 00:00:00 2001 From: xipingya Date: Sat, 27 Sep 2025 22:23:35 +0800 Subject: [PATCH 039/118] Enable mix video and image input. Signed-off-by: xipingya --- .../src/continuous_batching/pipeline_base.cpp | 30 ++++++++++------ .../src/visual_language/gemma3/classes.cpp | 14 +++----- .../src/visual_language/gemma3/classes.hpp | 2 +- .../src/visual_language/inputs_embedder.cpp | 31 ++++++++-------- .../src/visual_language/inputs_embedder.hpp | 12 +++---- src/cpp/src/visual_language/llava/classes.cpp | 7 ++-- src/cpp/src/visual_language/llava/classes.hpp | 2 +- .../visual_language/llava_next/classes.cpp | 6 ++-- .../visual_language/llava_next/classes.hpp | 2 +- .../src/visual_language/nanollava/classes.cpp | 6 +++- .../src/visual_language/nanollava/classes.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 5 +-- src/cpp/src/visual_language/pipeline_base.hpp | 36 ++++++++----------- .../src/visual_language/qwen2vl/classes.cpp | 18 +++++----- .../src/visual_language/qwen2vl/classes.hpp | 2 +- .../src/visual_language/vision_encoder.hpp | 6 ++-- 16 files changed, 91 insertions(+), 90 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 5c416388b4..64a73a16bc 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -167,7 +167,6 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); OPENVINO_ASSERT(prompts.size() == rgbs_vector.size() || prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of images/video vectors."); - OPENVINO_ASSERT(rgbs_vector.size() == 0u || video_vector.size() == 0u, "Only support one input, video or images"); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -177,11 +176,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( if (m_is_chat_conversation) { OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); - const auto& rgbs = video_vector.empty() ? rgbs_vector[0] : video_vector[0]; const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty()); + auto image_rgbs = rgbs_vector.size() > 0 ? 
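Since the tests now synthesize frames with OpenCV, a real clip can be packed into the same layout. A minimal sketch, assuming a placeholder `clip.mp4` and the `[num_frames, height, width, 3]` uint8 format that `create_countdown_frames` above produces:

```python
import cv2
import numpy as np
import openvino

# "clip.mp4" is a placeholder path; frames are converted from OpenCV's BGR
# order to RGB and stacked into a single 4D uint8 tensor.
cap = cv2.VideoCapture("clip.mp4")
frames = []
while True:
    ok, frame = cap.read()  # frame: [height, width, 3], BGR
    if not ok:
        break
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()

video_tensor = openvino.Tensor(np.stack(frames))
# vlm.generate(prompt, video=[video_tensor], max_new_tokens=10)
```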
rgbs_vector[0] : std::vector{}; + auto video_rgbs = video_vector.size() > 0 ? video_vector[0] : std::vector{}; + encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); @@ -193,11 +193,19 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( m_inputs_embedder->set_apply_chat_template_status(false); if (m_inputs_embedder->has_token_type_ids()) { - auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids); + auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, + m_history_images, + vlm_perf_metrics[0], + true, + m_history_image_ids); input_embeds_list.push_back(std::move(embeds)); token_type_ids_list.push_back(std::move(tt_ids)); } else { - input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, vlm_perf_metrics[0], rgbs.size() > 0, m_history_image_ids)); + input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history, + m_history_images, + vlm_perf_metrics[0], + true, + m_history_image_ids)); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -206,11 +214,12 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { for (size_t i = 0; i < prompts.size(); i++) { const auto& prompt = prompts[i]; - const auto& rgbs = video_vector.empty() ? rgbs_vector[i] : video_vector[i]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - const auto encoded_images = m_inputs_embedder->encode_images(rgbs, rgbs_vector.empty()); + auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[i] : std::vector{}; + auto video_rgbs = video_vector.size() > 0 ? video_vector[i] : std::vector{}; + const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); @@ -280,7 +289,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = m_inputs_embedder->encode_images(rgbs, false); + auto encoded_images = m_inputs_embedder->encode_images(rgbs, std::vector{}); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); @@ -295,15 +304,14 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re const std::vector& video, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); - OPENVINO_ASSERT((video.size() == 0u || images.size() == 0u), "Only support one input, video or images."); + ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = video.size() == 0 ? 
m_inputs_embedder->encode_images(images, false) - : m_inputs_embedder->encode_images(video, true); + auto encoded_images =m_inputs_embedder->encode_images(images, video); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/cpp/src/visual_language/gemma3/classes.cpp b/src/cpp/src/visual_language/gemma3/classes.cpp index 2a2ff28df0..81fc531e9b 100644 --- a/src/cpp/src/visual_language/gemma3/classes.cpp +++ b/src/cpp/src/visual_language/gemma3/classes.cpp @@ -71,20 +71,16 @@ bool InputsEmbedderGemma3::has_token_type_ids() const { return true; } -std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const bool& is_video) { +std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const std::vector& video) { + if (video.size() > 0) { + OPENVINO_THROW("Gemma3 doesn't support video preprocess currently. Input images are processed as separate images."); + } + std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); - if (is_video) { - embeds = m_vision_encoder->encode_video(single_images, vision_config); - if (!embeds.empty()) { - return embeds; - } - // Fallback to image process. - } - embeds.reserve(single_images.size()); for (const ov::Tensor& image : single_images) { embeds.emplace_back(m_vision_encoder->encode(image, vision_config)); diff --git a/src/cpp/src/visual_language/gemma3/classes.hpp b/src/cpp/src/visual_language/gemma3/classes.hpp index ad8f0814df..26087bb25b 100644 --- a/src/cpp/src/visual_language/gemma3/classes.hpp +++ b/src/cpp/src/visual_language/gemma3/classes.hpp @@ -41,7 +41,7 @@ class InputsEmbedderGemma3 : public InputsEmbedder::IInputsEmbedder { bool has_token_type_ids() const override; - std::vector encode_images(const std::vector& images, const bool& is_video = false) override; + std::vector encode_images(const std::vector& images, const std::vector& video) override; std::pair> normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const override; diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index a85290e14e..6ac93d9f4a 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -165,31 +165,33 @@ std::vector InputsEmbedder::IInputsEmbedder::to_single_image_tensors return single_image_tensors; } -std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images, const bool& is_video) { - std::vector single_images = to_single_image_tensors(images); +std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images, const std::vector& video) { std::vector embeds; - if (is_video) { - return m_vision_encoder->encode_video(single_images); + for (const ov::Tensor& single_video : video) { + std::vector single_frames = to_single_image_tensors({single_video}); + auto embeds_video = m_vision_encoder->encode_video(single_frames); + embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); } + std::vector single_images = to_single_image_tensors(images); for (const ov::Tensor& image : single_images) { embeds.emplace_back(m_vision_encoder->encode(image)); } return embeds; } -ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& 
prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return get_inputs_embeds(prompt, encode_images(images, is_video), metrics, true, image_sequence); +ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { + return get_inputs_embeds(prompt, encode_images(images, video), metrics, true, image_sequence); } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( const std::string& prompt, const std::vector& images, - const bool& is_video, + const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images, is_video), metrics, true, image_sequence); + return get_inputs_embeds_with_token_type_ids(prompt, encode_images(images, video), metrics, true, image_sequence); } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -267,8 +269,8 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map, } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return m_impl->get_inputs_embeds(prompt, images, is_video, metrics, image_sequence); +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { + return m_impl->get_inputs_embeds(prompt, images, video, metrics, image_sequence); } ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector& image_sequence) { @@ -278,11 +280,10 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( const std::string& prompt, const std::vector& images, - const bool& is_video, + const std::vector& video, VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return m_impl->get_inputs_embeds_with_token_type_ids( - prompt, images, is_video, metrics, image_sequence); + return m_impl->get_inputs_embeds_with_token_type_ids(prompt, images, video, metrics, image_sequence); } std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -299,8 +300,8 @@ bool InputsEmbedder::has_token_type_ids() const { return m_impl->has_token_type_ids(); } -std::vector InputsEmbedder::encode_images(const std::vector& images, const bool& is_video) { - return m_impl->encode_images(images, is_video); +std::vector InputsEmbedder::encode_images(const std::vector& images, const std::vector& video) { + return m_impl->encode_images(images, video); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index e9f5f1261f..7289dbb09d 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -35,18 +35,18 @@ class InputsEmbedder { const ov::AnyMap device_config); // compute input embedding for prompt and multiple images - ov::Tensor get_inputs_embeds(const std::string& 
prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); // compute input embedding and token_type_ids - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const bool& is_video, VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); + std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const std::vector& video, VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); bool has_token_type_ids() const; - std::vector encode_images(const std::vector& images, const bool& is_video = false); + std::vector encode_images(const std::vector& images, const std::vector& video); // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -108,15 +108,15 @@ class InputsEmbedder { public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) = 0; - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); + ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const bool& is_video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); + std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); virtual std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); virtual bool has_token_type_ids() const; - virtual std::vector encode_images(const std::vector& images, const bool& is_video = false); + virtual std::vector encode_images(const std::vector& images, const std::vector& video); virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index 1af833668a..4f26622d2d 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -92,10 +92,11 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -std::vector 
InputsEmbedderLLaVA::encode_images(const std::vector& images, const bool& is_video) { - if (is_video) { - Logger::warn("LLaVA doesn't support video preprocess currently. Input images are processed as separate images."); +std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, const std::vector& video) { + if (video.size() > 0) { + OPENVINO_THROW("LLaVA doesn't support video preprocess currently. Input images are processed as separate images."); } + std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/llava/classes.hpp b/src/cpp/src/visual_language/llava/classes.hpp index 660328fb62..8b4785ac82 100644 --- a/src/cpp/src/visual_language/llava/classes.hpp +++ b/src/cpp/src/visual_language/llava/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const bool& is_video = false) override; + std::vector encode_images(const std::vector& images, const std::vector& video) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index 3f6a6dcd8e..7b5657c711 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -333,9 +333,9 @@ ov::Tensor pack_image_features_llava_next( } // namespace -std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const bool& is_video) { - if (is_video) { - Logger::warn("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images."); +std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const std::vector& video) { + if (video.size() > 0) { + OPENVINO_THROW("LLaVANext doesn't support video preprocess currently. 
Input images are processed as separate images."); } std::vector embeds; diff --git a/src/cpp/src/visual_language/llava_next/classes.hpp b/src/cpp/src/visual_language/llava_next/classes.hpp index 19293c0e78..1bd830fe9f 100644 --- a/src/cpp/src/visual_language/llava_next/classes.hpp +++ b/src/cpp/src/visual_language/llava_next/classes.hpp @@ -24,7 +24,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const bool& is_video = false) override; + std::vector encode_images(const std::vector& images, const std::vector& video) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/nanollava/classes.cpp b/src/cpp/src/visual_language/nanollava/classes.cpp index 9a4a528ead..00bde6c2cd 100644 --- a/src/cpp/src/visual_language/nanollava/classes.cpp +++ b/src/cpp/src/visual_language/nanollava/classes.cpp @@ -127,7 +127,11 @@ InputsEmbedderNanoLLaVA::InputsEmbedderNanoLLaVA( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -std::vector InputsEmbedderNanoLLaVA::encode_images(const std::vector& images) { +std::vector InputsEmbedderNanoLLaVA::encode_images(const std::vector& images, const std::vector& video) { + if (video.size() > 0) { + OPENVINO_THROW("NanoLLaVA doesn't support video preprocess currently. Input images are processed as separate images."); + } + std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/nanollava/classes.hpp b/src/cpp/src/visual_language/nanollava/classes.hpp index 5b185c2219..33b5747471 100644 --- a/src/cpp/src/visual_language/nanollava/classes.hpp +++ b/src/cpp/src/visual_language/nanollava/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderNanoLLaVA : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images) override; + std::vector encode_images(const std::vector& images, const std::vector& video) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index b769a5b714..00d41227dd 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -166,8 +166,6 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ GenerationConfig generation_config, const StreamerVariant& streamer ) override { - OPENVINO_ASSERT((video.size() == 0 || images.size() == 0u), "Only support one input, video or images."); - auto generate_start_time = std::chrono::steady_clock::now(); VLMPerfMetrics perf_metrics; auto& raw_counters = perf_metrics.raw_metrics; @@ -195,8 +193,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - auto encoded_images = video.size() == 0u ? 
m_inputs_embedder->encode_images(images, false) - : m_inputs_embedder->encode_images(video, true); + auto encoded_images = m_inputs_embedder->encode_images(images, video); auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); if (m_is_chat_conversation) { diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index a0497f54ec..54be4b6934 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -42,47 +42,39 @@ class ov::genai::VLMPipeline::VLMPipelineBase { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); auto video = config_map.find(ov::genai::video.name()); - bool is_video = config_map.end() != video; - int num_set = (config_map.end() != image) + (config_map.end() != images) + (is_video); - OPENVINO_ASSERT(num_set <= 1, "Only one property can be set: image, images, or video."); ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? *config_arg : get_generation_config(); config.update_generation_config(config_map); - std::vector image_rgbs; + std::vector image_rgbs = {}; + std::vector video_rgbs = {}; if (config_map.end() != image) { image_rgbs = {image->second.as()}; - return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map)); - } + } if (config_map.end() != images) { if (images->second.is>()) { - image_rgbs = images->second.as>(); - } - else if (images->second.is()){ - image_rgbs = {images->second.as()}; - } - else { + auto imgs = images->second.as>(); + image_rgbs.insert(image_rgbs.end(), imgs.begin(), imgs.end()); + } else if (images->second.is()) { + image_rgbs.push_back(std::move(images->second.as())); + } else { OPENVINO_THROW("Unknown images type."); } - return generate(prompt, image_rgbs, config, utils::get_streamer_from_map(config_map)); } - if (is_video) { + if (config_map.end() != video) { if (video->second.is>()) { - image_rgbs = video->second.as>(); - } - else if (video->second.is()){ - image_rgbs = {video->second.as()}; - } - else { + video_rgbs = video->second.as>(); + } else if (video->second.is()) { + video_rgbs = {video->second.as()}; + } else { OPENVINO_THROW("Unknown video type."); } - return generate(prompt, {}, image_rgbs, config, utils::get_streamer_from_map(config_map)); } - return generate(prompt, image_rgbs, {}, config, utils::get_streamer_from_map(config_map)); + return generate(prompt, image_rgbs, video_rgbs, config, utils::get_streamer_from_map(config_map)); } virtual void start_chat(const std::string& system_message) = 0; diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 0346af0763..5974bc5e48 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -768,21 +768,21 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any return encode_with_imagepreprocess_ov({image}, config_map); } -std::vector VisionEncoderQwen2VL::encode_video(const std::vector& images, +std::vector VisionEncoderQwen2VL::encode_video(const std::vector& frames, const ov::AnyMap& config_map) { ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); std::vector encoded_imgs; - int i = 0; - int image_num = static_cast(images.size()); - for (; i < image_num - 
static_cast(config.temporal_patch_size); i += config.temporal_patch_size) { + size_t i = 0; + size_t image_num = frames.size(); + for (; i + config.temporal_patch_size <= image_num; i += config.temporal_patch_size) { EncodedImage encoded_img; if (use_ov_image_preprocess == false) { encoded_img = encode_with_imagepreprocess_cpp( - std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size), + std::vector(frames.begin() + i, frames.begin() + i + config.temporal_patch_size), config_map); } else { encoded_img = encode_with_imagepreprocess_ov( - std::vector(images.begin() + i, images.begin() + i + config.temporal_patch_size), + std::vector(frames.begin() + i, frames.begin() + i + config.temporal_patch_size), config_map); } @@ -791,9 +791,9 @@ std::vector VisionEncoderQwen2VL::encode_video(const std::vector> InputsEmbedderQwen2VL::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_id, images.size()); - std::vector> images_grid_thw; + std::vector> images_grid_thw; images_grid_thw.reserve(images.size()); for (const auto& encoded_image : images) { diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 87fa1588c8..864c3bae3d 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -18,7 +18,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { explicit VisionEncoderQwen2VL(const ModelsMap& models_map, const std::filesystem::path& config_dir_path, const std::string& device, const ov::AnyMap properties); EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override; - std::vector encode_video(const std::vector& image, const ov::AnyMap& config_map) override; + std::vector encode_video(const std::vector& frames, const ov::AnyMap& config_map) override; private: EncodedImage encode_with_imagepreprocess_cpp(const std::vector& image, const ov::AnyMap& config_map); diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 46907b82b0..cd6cbf7dcf 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -101,8 +101,10 @@ class VisionEncoder { /// @return Resulting embeddings for the resized source image and /// its slices. virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; - virtual std::vector encode_video(const std::vector& images, const ov::AnyMap& config_map = {}) { - // Video encode not implemented, return empty and fallback to image encode. 
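The rewritten frame loop above switches the index to `size_t` and the termination test from `i < image_num - temporal_patch_size` to `i + temporal_patch_size <= image_num`. The arithmetic difference is easiest to see in a small sketch (Python purely for illustration; `temporal_patch_size` is 2 for these models):

```python
# Compare the two loop bounds for grouping frames into temporal patches.
def group_frames(num_frames: int, patch: int = 2):
    old, new = [], []
    i = 0
    while i < num_frames - patch:   # old signed bound
        old.append((i, i + patch))
        i += patch
    i = 0
    while i + patch <= num_frames:  # new bound
        new.append((i, i + patch))
        i += patch
    return old, new

# The new bound covers the final complete pair, and it avoids the unsigned
# underflow that `image_num - temporal_patch_size` would cause now that the
# C++ indices are size_t and fewer frames than temporal_patch_size arrive.
print(group_frames(4))  # ([(0, 2)], [(0, 2), (2, 4)])
```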
+
+    /// @brief Compute embeddings of one or multiple videos.
+    virtual std::vector encode_video(const std::vector& frames, const ov::AnyMap& config_map = {}) {
+        OPENVINO_THROW("The current model does not support 'video' input, please use 'images' instead.");
         return {};
     }
 
From c0829a3d66f745aa787a1b0915b78616d97b901d Mon Sep 17 00:00:00 2001
From: xipingya 
Date: Sun, 28 Sep 2025 09:58:48 +0800
Subject: [PATCH 040/118] split encode_images into encode_images and
 encode_video

Signed-off-by: xipingya 
---
 .../src/continuous_batching/pipeline_base.cpp |  3 ++-
 .../src/visual_language/inputs_embedder.cpp   | 23 ++++++++++++-------
 .../src/visual_language/inputs_embedder.hpp   |  7 ++++--
 3 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp
index 64a73a16bc..fd7cc53590 100644
--- a/src/cpp/src/continuous_batching/pipeline_base.cpp
+++ b/src/cpp/src/continuous_batching/pipeline_base.cpp
@@ -181,7 +181,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate(
 
         auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[0] : std::vector{};
         auto video_rgbs = video_vector.size() > 0 ? video_vector[0] : std::vector{};
-        encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs);
+        encoded_images = m_inputs_embedder->encode_images(image_rgbs);
+        for (auto& video_embed : m_inputs_embedder->encode_video(video_rgbs)) encoded_images.push_back(std::move(video_embed));
         m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end());
 
         const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images);
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 6ac93d9f4a..6c89f76866 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -165,19 +165,22 @@ std::vector InputsEmbedder::IInputsEmbedder::to_single_image_tensors
     return single_image_tensors;
 }
 
-std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images, const std::vector& video) {
+std::vector InputsEmbedder::IInputsEmbedder::encode_images(const std::vector& images) {
     std::vector embeds;
+    std::vector single_images = to_single_image_tensors(images);
+    for (const ov::Tensor& image : single_images) {
+        embeds.emplace_back(m_vision_encoder->encode(image));
+    }
+    return embeds;
+}
 
+std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& video) {
+    std::vector embeds;
     for (const ov::Tensor& single_video : video) {
         std::vector single_frames = to_single_image_tensors({single_video});
         auto embeds_video = m_vision_encoder->encode_video(single_frames);
         embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end());
     }
-
-    std::vector single_images = to_single_image_tensors(images);
-    for (const ov::Tensor& image : single_images) {
-        embeds.emplace_back(m_vision_encoder->encode(image));
-    }
     return embeds;
 }
 
@@ -300,8 +303,12 @@ bool InputsEmbedder::has_token_type_ids() const {
     return m_impl->has_token_type_ids();
 }
 
-std::vector InputsEmbedder::encode_images(const std::vector& images, const std::vector& video) {
-    return m_impl->encode_images(images, video);
+std::vector InputsEmbedder::encode_images(const std::vector& images) {
+    return m_impl->encode_images(images);
+}
+
+std::vector InputsEmbedder::encode_video(const std::vector& video) {
+    return m_impl->encode_video(video);
 }
 
 std::pair> InputsEmbedder::get_position_ids(const size_t
inputs_embeds_size, const size_t history_size) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 7289dbb09d..a762474130 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -46,7 +46,8 @@ class InputsEmbedder { bool has_token_type_ids() const; - std::vector encode_images(const std::vector& images, const std::vector& video); + std::vector encode_images(const std::vector& images); + std::vector encode_video(const std::vector& video); // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -116,7 +117,9 @@ class InputsEmbedder { virtual bool has_token_type_ids() const; - virtual std::vector encode_images(const std::vector& images, const std::vector& video); + virtual std::vector encode_images(const std::vector& images); + + virtual std::vector encode_video(const std::vector& video); virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); From f25770b4fdd3968012ffde513efc934f8e975ffd Mon Sep 17 00:00:00 2001 From: xipingya Date: Sun, 28 Sep 2025 10:06:35 +0800 Subject: [PATCH 041/118] Remove: get_inputs_embeds(const std::string& prompt, const std::vector& images ... get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ... Because 1: they never been called for current codes. 2: Getting embeds feature, we usually need to apply a chat template. I think only keeping below interface is enough. get_inputs_embeds(const std::string& prompt, const std::vector& images... get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images... Signed-off-by: xipingya --- .../src/continuous_batching/pipeline_base.cpp | 5 +++-- src/cpp/src/visual_language/gemma3/classes.cpp | 10 ++++++---- src/cpp/src/visual_language/gemma3/classes.hpp | 2 +- .../src/visual_language/inputs_embedder.cpp | 18 +----------------- .../src/visual_language/inputs_embedder.hpp | 9 +-------- src/cpp/src/visual_language/llava/classes.cpp | 6 +++--- .../src/visual_language/llava_next/classes.cpp | 6 +++--- .../src/visual_language/llava_next/classes.hpp | 2 +- .../src/visual_language/nanollava/classes.cpp | 8 ++++---- .../src/visual_language/nanollava/classes.hpp | 2 +- 10 files changed, 24 insertions(+), 44 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index fd7cc53590..cbf4deeede 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -290,7 +290,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = m_inputs_embedder->encode_images(rgbs, std::vector{}); + auto encoded_images = m_inputs_embedder->encode_images(rgbs); const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); @@ -312,7 +312,8 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images 
=m_inputs_embedder->encode_images(images, video);
+        auto encoded_images = m_inputs_embedder->encode_images(images);
+        for (auto& video_embed : m_inputs_embedder->encode_video(video)) encoded_images.push_back(std::move(video_embed));
         const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images);
 
         inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence);
diff --git a/src/cpp/src/visual_language/gemma3/classes.cpp b/src/cpp/src/visual_language/gemma3/classes.cpp
index 81fc531e9b..5a914e5af6 100644
--- a/src/cpp/src/visual_language/gemma3/classes.cpp
+++ b/src/cpp/src/visual_language/gemma3/classes.cpp
@@ -71,11 +71,13 @@ bool InputsEmbedderGemma3::has_token_type_ids() const {
     return true;
 }
 
-std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const std::vector& video) {
-    if (video.size() > 0) {
-        OPENVINO_THROW("Gemma3 doesn't support video preprocess currently. Input images are processed as separate images.");
-    }
+// std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const std::vector& video) {
+//     if (video.size() > 0) {
+//         OPENVINO_THROW("Gemma3 doesn't support video preprocess currently. Input images are processed as separate images.");
+//     }
+// }
 
+std::vector InputsEmbedderGemma3::encode_images(const std::vector& images) {
     std::vector embeds;
 
     ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}};
diff --git a/src/cpp/src/visual_language/gemma3/classes.hpp b/src/cpp/src/visual_language/gemma3/classes.hpp
index 26087bb25b..b78fa8d193 100644
--- a/src/cpp/src/visual_language/gemma3/classes.hpp
+++ b/src/cpp/src/visual_language/gemma3/classes.hpp
@@ -41,7 +41,7 @@ class InputsEmbedderGemma3 : public InputsEmbedder::IInputsEmbedder {
 
     bool has_token_type_ids() const override;
 
-    std::vector encode_images(const std::vector& images, const std::vector& video) override;
+    std::vector encode_images(const std::vector& images) override;
 
     std::pair> normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const override;
 
diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp
index 6c89f76866..62a29aec5b 100644
--- a/src/cpp/src/visual_language/inputs_embedder.cpp
+++ b/src/cpp/src/visual_language/inputs_embedder.cpp
@@ -175,6 +175,7 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_ima
 }
 
 std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& video) {
+    // Each element of `video` is a multi-frame tensor; its frames are encoded together.
     std::vector embeds;
     for (const ov::Tensor& single_video : video) {
         std::vector single_frames = to_single_image_tensors({single_video});
@@ -184,19 +185,6 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_vid
     return embeds;
 }
 
-std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids(
-    const std::string&
prompt, const std::vector& images, @@ -272,10 +260,6 @@ InputsEmbedder::InputsEmbedder(const ModelsMap& models_map, } } -ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence) { - return m_impl->get_inputs_embeds(prompt, images, video, metrics, image_sequence); -} - ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector& image_sequence) { return m_impl->get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings, image_sequence); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index a762474130..adde372768 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -35,18 +35,15 @@ class InputsEmbedder { const ov::AnyMap device_config); // compute input embedding for prompt and multiple images - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); // compute input embedding and token_type_ids - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const std::vector& video, VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); bool has_token_type_ids() const; std::vector encode_images(const std::vector& images); + std::vector encode_video(const std::vector& video); // compute position ids for language model input @@ -109,10 +106,6 @@ class InputsEmbedder { public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) = 0; - ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence); - - std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, const std::vector& video, ov::genai::VLMPerfMetrics& metrics, const std::vector& image_sequence = {}); - virtual std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); virtual bool has_token_type_ids() const; diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index 4f26622d2d..b1756514bd 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -93,9 +93,9 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, 
const std::vector& video) { - if (video.size() > 0) { - OPENVINO_THROW("LLaVA doesn't support video preprocess currently. Input images are processed as separate images."); - } + // if (video.size() > 0) { + // OPENVINO_THROW("LLaVA doesn't support video preprocess currently. Input images are processed as separate images."); + // } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index 7b5657c711..fd593d17ed 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -334,9 +334,9 @@ ov::Tensor pack_image_features_llava_next( } // namespace std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const std::vector& video) { - if (video.size() > 0) { - OPENVINO_THROW("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images."); - } + // if (video.size() > 0) { + // OPENVINO_THROW("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images."); + // } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; diff --git a/src/cpp/src/visual_language/llava_next/classes.hpp b/src/cpp/src/visual_language/llava_next/classes.hpp index 1bd830fe9f..b79597b519 100644 --- a/src/cpp/src/visual_language/llava_next/classes.hpp +++ b/src/cpp/src/visual_language/llava_next/classes.hpp @@ -24,7 +24,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const std::vector& video) override; + std::vector encode_images(const std::vector& images) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/nanollava/classes.cpp b/src/cpp/src/visual_language/nanollava/classes.cpp index 00bde6c2cd..a898daf960 100644 --- a/src/cpp/src/visual_language/nanollava/classes.cpp +++ b/src/cpp/src/visual_language/nanollava/classes.cpp @@ -127,10 +127,10 @@ InputsEmbedderNanoLLaVA::InputsEmbedderNanoLLaVA( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -std::vector InputsEmbedderNanoLLaVA::encode_images(const std::vector& images, const std::vector& video) { - if (video.size() > 0) { - OPENVINO_THROW("NanoLLaVA doesn't support video preprocess currently. Input images are processed as separate images."); - } +std::vector InputsEmbedderNanoLLaVA::encode_images(const std::vector& images) { + // if (video.size() > 0) { + // OPENVINO_THROW("NanoLLaVA doesn't support video preprocess currently. 
Input images are processed as separate images."); + // } std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; diff --git a/src/cpp/src/visual_language/nanollava/classes.hpp b/src/cpp/src/visual_language/nanollava/classes.hpp index 33b5747471..5b185c2219 100644 --- a/src/cpp/src/visual_language/nanollava/classes.hpp +++ b/src/cpp/src/visual_language/nanollava/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderNanoLLaVA : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const std::vector& video) override; + std::vector encode_images(const std::vector& images) override; std::pair> normalize_prompt( const std::string& prompt, From 72c621b84e8fcfdc009b6b2034d1f5bb91b56c76 Mon Sep 17 00:00:00 2001 From: xipingya Date: Sun, 28 Sep 2025 21:17:59 +0800 Subject: [PATCH 042/118] 1: Add placeholder, 2: Enable video for get_input_embeds Signed-off-by: xipingya --- .../src/continuous_batching/pipeline_base.cpp | 31 +++++++-- .../src/visual_language/gemma3/classes.cpp | 6 -- .../src/visual_language/inputs_embedder.cpp | 66 ++++++++++++------ .../src/visual_language/inputs_embedder.hpp | 31 ++++++++- src/cpp/src/visual_language/llava/classes.cpp | 6 +- src/cpp/src/visual_language/llava/classes.hpp | 2 +- .../visual_language/llava_next/classes.cpp | 6 +- .../src/visual_language/nanollava/classes.cpp | 4 -- src/cpp/src/visual_language/pipeline.cpp | 9 ++- .../visual_language/qwen2_5_vl/classes.cpp | 4 +- .../visual_language/qwen2_5_vl/classes.hpp | 4 +- .../src/visual_language/qwen2vl/classes.cpp | 68 +++++++++++++++++-- .../src/visual_language/qwen2vl/classes.hpp | 24 ++++++- .../src/visual_language/vision_encoder.hpp | 1 - src/cpp/src/visual_language/vlm_config.hpp | 1 + 15 files changed, 201 insertions(+), 62 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index cbf4deeede..77453bb316 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -173,19 +173,26 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( std::vector vlm_perf_metrics(prompts.size()); std::vector encoded_images = {}; + std::vector> encoded_videos = {}; if (m_is_chat_conversation) { OPENVINO_ASSERT(1 == prompts.size(), "Can't chat with multiple prompts"); const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[0] : std::vector{}; + std::vector vision_sequence; auto video_rgbs = video_vector.size() > 0 ? video_vector[0] : std::vector{}; + for (auto& vd : video_rgbs) { + auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + m_history_images.insert(m_history_images.end(), encoded_vd.begin(), encoded_vd.end()); + encoded_videos.push_back(encoded_vd); + } + + auto image_rgbs = rgbs_vector.size() > 0 ? 
rgbs_vector[0] : std::vector{}; encoded_images = m_inputs_embedder->encode_images(image_rgbs); - encoded_images = m_inputs_embedder->encode_video(video_rgbs); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); - const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); + const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); m_history.push_back({{"role", "user"}, {"content", unified_prompt}}); m_history_image_ids.insert(m_history_image_ids.end(), image_sequence.begin(), image_sequence.end()); @@ -220,8 +227,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( auto image_rgbs = rgbs_vector.size() > 0 ? rgbs_vector[i] : std::vector{}; auto video_rgbs = video_vector.size() > 0 ? video_vector[i] : std::vector{}; - const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs, video_rgbs); - auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); + const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs); + std::vector> encoded_videos; + for (auto& vd : video_rgbs) { + auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + encoded_videos.push_back(encoded_vd); + } + + auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); @@ -313,9 +326,13 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); auto encoded_images = m_inputs_embedder->encode_images(images); - auto encoded_video = m_inputs_embedder->encode_images(video); + std::vector> encoded_videos; + for (auto& vd : video) { + auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + encoded_videos.push_back(encoded_vd); + } - const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); + const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images, encoded_videos); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); } return add_request(request_id, inputs, sampling_params); diff --git a/src/cpp/src/visual_language/gemma3/classes.cpp b/src/cpp/src/visual_language/gemma3/classes.cpp index 5a914e5af6..cee5f2cbfd 100644 --- a/src/cpp/src/visual_language/gemma3/classes.cpp +++ b/src/cpp/src/visual_language/gemma3/classes.cpp @@ -71,12 +71,6 @@ bool InputsEmbedderGemma3::has_token_type_ids() const { return true; } -// std::vector InputsEmbedderGemma3::encode_images(const std::vector& images, const std::vector& video) { -// if (video.size() > 0) { -// OPENVINO_THROW("Gemma3 doesn't support video preprocess currently. 
Input images are processed as separate images."); -// } -// } - std::vector InputsEmbedderGemma3::encode_images(const std::vector& images) { std::vector embeds; diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 62a29aec5b..dd87bdd349 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -174,15 +174,27 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_ima return embeds; } -std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& video) { - // encode_images - std::vector embeds; - for (const ov::Tensor& single_video : video) { - std::vector single_frames = to_single_image_tensors({single_video}); - auto embeds_video = m_vision_encoder->encode_video(single_frames); - embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); - } - return embeds; +ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds( + const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings, + const std::vector& images_sequence, + const std::vector& videos_sequence) { + OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); +} + +std::vector InputsEmbedder::IInputsEmbedder::encode_videos(const std::vector& videos) { + OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); +} + +std::pair> InputsEmbedder::IInputsEmbedder::normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos) const { + OPENVINO_THROW("Current model doesn't support video preprocess currently. 
Input images are processed as separate images."); } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -264,13 +276,20 @@ ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, const st return m_impl->get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings, image_sequence); } -std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( - const std::string& prompt, - const std::vector& images, - const std::vector& video, - VLMPerfMetrics& metrics, - const std::vector& image_sequence) { - return m_impl->get_inputs_embeds_with_token_type_ids(prompt, images, video, metrics, image_sequence); +ov::Tensor InputsEmbedder::get_inputs_embeds(const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings, + const std::vector& images_sequence, + const std::vector& videos_sequence) { + return m_impl->get_inputs_embeds(prompt, + images, + videos, + metrics, + recalculate_merged_embeddings, + images_sequence, + videos_sequence); } std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -291,8 +310,8 @@ std::vector InputsEmbedder::encode_images(const std::ve return m_impl->encode_images(images); } -std::vector InputsEmbedder::encode_video(const std::vector& video) { - return m_impl->encode_video(video); +std::vector InputsEmbedder::encode_videos(const std::vector& videos) { + return m_impl->encode_videos(videos); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { @@ -332,7 +351,16 @@ std::pair> InputsEmbedder::normalize_prompt( size_t base_id, const std::vector& images ) const { - return m_impl->normalize_prompt(prompt, base_id, images); + return m_impl->normalize_prompt(prompt, base_id, images, {}); +} + +std::pair> InputsEmbedder::normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos +) const { + return m_impl->normalize_prompt(prompt, base_id, images, videos); } void verify_ids(const std::vector& image_ids, size_t base_id, size_t n_images) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index adde372768..f4b328f9b1 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -36,6 +36,13 @@ class InputsEmbedder { // compute input embedding for prompt and multiple images ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); + ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings = true, + const std::vector& image_sequence = {}, + const std::vector& videos_sequence = {}); // compute input embedding and token_type_ids std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); @@ -44,7 +51,7 @@ class InputsEmbedder { std::vector encode_images(const std::vector& images); - std::vector encode_video(const std::vector& video); + std::vector encode_videos(const std::vector& videos); // compute position ids for language model input std::pair> 
get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -76,6 +83,12 @@ class InputsEmbedder { const std::vector& images ) const; + virtual std::pair> normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos) const; + private: class IInputsEmbedder { protected: @@ -105,6 +118,13 @@ class InputsEmbedder { public: virtual ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) = 0; + virtual ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings = true, + const std::vector& image_sequence = {}, + const std::vector& videos_sequence = {}); virtual std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); @@ -112,7 +132,7 @@ class InputsEmbedder { virtual std::vector encode_images(const std::vector& images); - virtual std::vector encode_video(const std::vector& video); + virtual std::vector encode_videos(const std::vector& videos); virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -143,7 +163,12 @@ class InputsEmbedder { size_t base_id, const std::vector& images ) const = 0; - + virtual std::pair> normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos) const; + protected: IInputsEmbedder( const VLMConfig& vlm_config, diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index b1756514bd..7e0e6ae774 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -92,11 +92,7 @@ InputsEmbedderLLaVA::InputsEmbedderLLaVA( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images, const std::vector& video) { - // if (video.size() > 0) { - // OPENVINO_THROW("LLaVA doesn't support video preprocess currently. 
Input images are processed as separate images."); - // } - +std::vector InputsEmbedderLLaVA::encode_images(const std::vector& images) { std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/llava/classes.hpp b/src/cpp/src/visual_language/llava/classes.hpp index 8b4785ac82..8cc8c147d7 100644 --- a/src/cpp/src/visual_language/llava/classes.hpp +++ b/src/cpp/src/visual_language/llava/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - std::vector encode_images(const std::vector& images, const std::vector& video) override; + std::vector encode_images(const std::vector& images) override; std::pair> normalize_prompt( const std::string& prompt, diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index fd593d17ed..db9717c568 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -333,11 +333,7 @@ ov::Tensor pack_image_features_llava_next( } // namespace -std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images, const std::vector& video) { - // if (video.size() > 0) { - // OPENVINO_THROW("LLaVANext doesn't support video preprocess currently. Input images are processed as separate images."); - // } - +std::vector InputsEmbedderLLaVANext::encode_images(const std::vector& images) { std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/nanollava/classes.cpp b/src/cpp/src/visual_language/nanollava/classes.cpp index a898daf960..9a4a528ead 100644 --- a/src/cpp/src/visual_language/nanollava/classes.cpp +++ b/src/cpp/src/visual_language/nanollava/classes.cpp @@ -128,10 +128,6 @@ InputsEmbedderNanoLLaVA::InputsEmbedderNanoLLaVA( IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } std::vector InputsEmbedderNanoLLaVA::encode_images(const std::vector& images) { - // if (video.size() > 0) { - // OPENVINO_THROW("NanoLLaVA doesn't support video preprocess currently. 
Input images are processed as separate images."); - // } - std::vector embeds; ov::AnyMap vision_config = {{"patch_size", m_vlm_config.vision_config_patch_size}}; std::vector single_images = to_single_image_tensors(images); diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 00d41227dd..927a3a40aa 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -193,8 +193,13 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ "Currently only \"num_return_sequences\" equal to 1 is supported for NPU device!"); } - auto encoded_images = m_inputs_embedder->encode_images(images, video); - auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images); + auto encoded_images = m_inputs_embedder->encode_images(images); + std::vector> encoded_videos; + for (auto& vd : video) { + auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + encoded_videos.push_back(encoded_vd); + } + auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); if (m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", unified_prompt}}); diff --git a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp index f2def1a5c8..b4ef0fad0c 100644 --- a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp @@ -109,7 +109,9 @@ InputsEmbedderQwen2_5_VL::InputsEmbedderQwen2_5_VL( ov::Tensor InputsEmbedderQwen2_5_VL::run_image_embeddings_merger( const std::vector& images, - const std::vector& images_sequence + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence ) { auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_embeds_and_grid_thw(images, images_sequence); diff --git a/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp b/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp index 1dca52d276..081ac1cb82 100644 --- a/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp @@ -37,7 +37,9 @@ class InputsEmbedderQwen2_5_VL : public InputsEmbedderQwen2VL { protected: ov::Tensor run_image_embeddings_merger( const std::vector& images, - const std::vector& images_sequence) override; + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence) override; }; } // namespace ov::genai diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 5974bc5e48..43de10d4c8 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -29,6 +29,7 @@ namespace { // Chat template hardcodes char sequence instead of referring to tag values, so NATIVE_TAG is hardcoded as well. 
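// A rough worked example of the expansion normalize_prompt performs on
// NATIVE_VIDEO_TAG below (the numbers are illustrative, assuming
// merge_size == 2 and one video encoded as grid_t = 8 frames over a
// 16 x 16 patch grid):
//   num_video_pad_tokens = grid_t * grid_h * grid_w / merge_size^2
//                        = 8 * 16 * 16 / 4 = 512
// i.e. the tag is replaced by <|vision_start|>, then 512 <|video_pad|>
// tokens, then <|vision_end|>.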
const std::string NATIVE_TAG = "<|vision_start|><|image_pad|><|vision_end|>"; +const std::string NATIVE_VIDEO_TAG = "<|vision_start|><|video_pad|><|vision_end|>"; std::shared_ptr create_f32_nchw_input(std::shared_ptr input) { auto raw_images_f32 = std::make_shared(input, ov::element::f32); @@ -854,7 +855,12 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( }); } -std::pair> InputsEmbedderQwen2VL::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +std::pair> InputsEmbedderQwen2VL::normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos) const { + // Images auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_id, images.size()); std::vector> images_grid_thw; images_grid_thw.reserve(images.size()); @@ -878,10 +884,52 @@ std::pair> InputsEmbedderQwen2VL::normalize_pro expanded_tag += m_vlm_config.vision_end_token; unified_prompt.replace(unified_prompt.find(NATIVE_TAG), NATIVE_TAG.length(), expanded_tag); } - return {std::move(unified_prompt), std::move(images_sequence)}; + + // Video + std::vector videos_sequence; + std::tie(unified_prompt, videos_sequence) = + normalize(unified_prompt, NATIVE_VIDEO_TAG, NATIVE_VIDEO_TAG, base_id, videos.size()); + std::vector> video_grid_thw; + video_grid_thw.reserve(videos.size()); + + for (const auto& encoded_vd : videos) { + size_t grid_t = encoded_vd.size(); + OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); + size_t grid_h = encoded_vd[0].resized_source_size.height; + size_t grid_w = encoded_vd[0].resized_source_size.width; + video_grid_thw.push_back({grid_t, grid_h, grid_w}); + } + + for (size_t new_image_id : videos_sequence) { + auto [grid_t, grid_h, grid_w] = video_grid_thw.at(new_image_id - base_id); + size_t merge_length = std::pow(m_vision_encoder->get_processor_config().merge_size, 2); + size_t num_video_pad_tokens = grid_t * grid_h * grid_w / merge_length; + + std::string expanded_tag = m_vlm_config.vision_start_token; + for (int i = 0; i < num_video_pad_tokens; i++) { + expanded_tag += m_vlm_config.video_pad_token; + } + expanded_tag += m_vlm_config.vision_end_token; + unified_prompt.replace(unified_prompt.find(NATIVE_VIDEO_TAG), NATIVE_VIDEO_TAG.length(), expanded_tag); + } + + std::vector vision_sequence; + vision_sequence.insert(vision_sequence.end(), videos_sequence.begin(), videos_sequence.end()); + vision_sequence.insert(vision_sequence.end(), images_sequence.begin(), images_sequence.end()); + return {std::move(unified_prompt), std::move(vision_sequence)}; } ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector& images_sequence) { + return get_inputs_embeds(unified_prompt, images, {}, metrics, recalculate_merged_embeddings, images_sequence, {}); +} + +ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings, + const std::vector& images_sequence, + const std::vector& videos_sequence) { std::vector> images_grid_thw; images_grid_thw.reserve(images.size()); for (const auto& encoded_image : images) { @@ -917,13 +965,23 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p } ov::Tensor merged_image_embeddings_tensor; if 
(recalculate_merged_embeddings) { - m_merged_image_embeddings = run_image_embeddings_merger(images, images_sequence); + m_merged_image_embeddings = run_image_embeddings_merger(images, images_sequence, videos, videos_sequence); } merged_image_embeddings_tensor = m_merged_image_embeddings; return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, merged_image_embeddings_tensor, image_pad_token_id); } +std::vector InputsEmbedderQwen2VL::encode_videos(const std::vector& video) { + std::vector embeds; + for (const ov::Tensor& single_video : video) { + std::vector single_frames = to_single_image_tensors({single_video}); + auto embeds_video = m_vision_encoder->encode_video(single_frames); + embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); + } + return embeds; +} + std::pair> InputsEmbedderQwen2VL::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { if (history_size != 0) { ov::Tensor position_ids{ov::element::i64, {3, 1, inputs_embeds_size}}; @@ -952,7 +1010,9 @@ void InputsEmbedderQwen2VL::finish_chat() { ov::Tensor InputsEmbedderQwen2VL::run_image_embeddings_merger( const std::vector& images, - const std::vector& images_sequence + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence ) { auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_embeds_and_grid_thw(images, images_sequence); diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 864c3bae3d..426b493bac 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -43,6 +43,15 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const ov::AnyMap device_config); ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; + ov::Tensor get_inputs_embeds(const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings = true, + const std::vector& image_sequence = {}, + const std::vector& videos_sequence = {}) override; + + std::vector encode_videos(const std::vector& videos) override; std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) override; @@ -53,8 +62,15 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { std::pair> normalize_prompt( const std::string& prompt, size_t base_id, - const std::vector& images - ) const override; + const std::vector& images) const override { + return normalize_prompt(prompt, base_id, images, {}); + } + + std::pair> normalize_prompt( + const std::string& prompt, + size_t base_id, + const std::vector& images, + const std::vector>& videos) const override; protected: // A model for merging image embeddings (hidden states), rotary_pos_emb and attention_mask.
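// A minimal sketch of the layout this merger is assumed to consume after the
// change (names are illustrative only; the exact grid handling lives in the
// reorder helper reworked later in the series): video embeddings and their
// {t, h, w} grids are laid out ahead of image embeddings.
//
//   std::vector<ov::Tensor> embeds;               // videos first, then images
//   std::vector<std::array<size_t, 3>> grids;     // one {t, h, w} entry per input
//   for (const auto& video : encoded_videos) {
//       for (const auto& frame : video)
//           embeds.push_back(frame.resized_source);   // per-frame hidden states
//       grids.push_back({video.size(),                // grid_t = frame count
//                        video[0].resized_source_size.height,
//                        video[0].resized_source_size.width});
//   }
//   for (const auto& image : encoded_images) {
//       embeds.push_back(image.resized_source);
//       grids.push_back({1,                           // single-frame grid
//                        image.resized_source_size.height,
//                        image.resized_source_size.width});
//   }
//
// Both sequences are then handed to the merger together with videos_sequence
// and images_sequence so the embeddings can be reordered to prompt order.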
@@ -73,7 +89,9 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { virtual ov::Tensor run_image_embeddings_merger( const std::vector& images, - const std::vector& images_sequence); + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence); ov::Tensor get_rotary_pos_emb(const std::vector>& grids_thw); diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index cd6cbf7dcf..ade9736bb4 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -105,7 +105,6 @@ class VisionEncoder { /// @brief Compute embeddings of one or multiple videos given virtual std::vector encode_video(const std::vector& frames, const ov::AnyMap& config_map = {}) { OPENVINO_THROW("The current model does not support 'video' input, please use 'images' instead."); - return {}; } /// @brief Gets processor config diff --git a/src/cpp/src/visual_language/vlm_config.hpp b/src/cpp/src/visual_language/vlm_config.hpp index c30ed6d2ab..0c45689dec 100644 --- a/src/cpp/src/visual_language/vlm_config.hpp +++ b/src/cpp/src/visual_language/vlm_config.hpp @@ -76,6 +76,7 @@ class VLMConfig { std::string vision_start_token = "<|vision_start|>"; /// @brief A placeholder for image embeddings in text for Qwen2VL model. std::string image_pad_token = "<|image_pad|>"; + std::string video_pad_token = "<|video_pad|>"; /// @brief A string token denoting end of vision embeddings for Qwen2VL model. std::string vision_end_token = "<|vision_end|>"; From 132b2282daeabdc3921ecf419a288a165200878e Mon Sep 17 00:00:00 2001 From: xipingya Date: Mon, 29 Sep 2025 20:47:56 +0800 Subject: [PATCH 043/118] Update position_ids after enabling video. Signed-off-by: xipingya --- .../src/continuous_batching/pipeline_base.cpp | 32 +++---- .../src/continuous_batching/pipeline_base.hpp | 2 + .../src/visual_language/inputs_embedder.cpp | 13 +-- .../src/visual_language/inputs_embedder.hpp | 9 +- src/cpp/src/visual_language/pipeline.cpp | 16 ++-- .../src/visual_language/processor_config.hpp | 1 + .../visual_language/qwen2_5_vl/classes.cpp | 2 +- .../src/visual_language/qwen2vl/classes.cpp | 87 ++++++++++++++----- .../src/visual_language/qwen2vl/classes.hpp | 17 ++-- .../src/visual_language/vision_encoder.hpp | 8 +- 10 files changed, 124 insertions(+), 63 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 77453bb316..81442a25b7 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -166,7 +166,6 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS); OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); - OPENVINO_ASSERT(prompts.size() == rgbs_vector.size() || prompts.size() == video_vector.size(), "Number of prompts should be equal to the number of images/video vectors."); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -180,11 +179,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto& prompt = prompts[0]; auto start_get_inputs_embeds = std::chrono::steady_clock::now(); - std::vector vision_sequence; auto video_rgbs = video_vector.size() > 0 ?
video_vector[0] : std::vector{}; for (auto& vd : video_rgbs) { - auto encoded_vd = m_inputs_embedder->encode_videos({vd}); - m_history_images.insert(m_history_images.end(), encoded_vd.begin(), encoded_vd.end()); + auto encoded_vd = m_inputs_embedder->encode_video({vd}); + m_history_videos.push_back(encoded_vd); encoded_videos.push_back(encoded_vd); } @@ -192,15 +190,17 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( encoded_images = m_inputs_embedder->encode_images(image_rgbs); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); - const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); - m_history.push_back({{"role", "user"}, {"content", unified_prompt}}); - m_history_image_ids.insert(m_history_image_ids.end(), image_sequence.begin(), image_sequence.end()); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}}); + m_history_image_ids.insert(m_history_image_ids.end(), norm_prompt.images_sequence.begin(), norm_prompt.images_sequence.end()); + m_history_video_ids.insert(m_history_video_ids.end(), norm_prompt.videos_sequence.begin(), norm_prompt.videos_sequence.end()); std::string templated_history = m_tokenizer.apply_chat_template(m_history, true); m_inputs_embedder->set_apply_chat_template_status(false); if (m_inputs_embedder->has_token_type_ids()) { + // Todo: support video auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, vlm_perf_metrics[0], @@ -211,9 +211,11 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( } else { input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(templated_history, m_history_images, + m_history_videos, vlm_perf_metrics[0], true, - m_history_image_ids)); + m_history_image_ids, + m_history_video_ids)); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -230,20 +232,20 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const auto encoded_images = m_inputs_embedder->encode_images(image_rgbs); std::vector> encoded_videos; for (auto& vd : video_rgbs) { - auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } - auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); if (m_inputs_embedder->has_token_type_ids()) { - auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(unified_prompt, encoded_images, vlm_perf_metrics[i], true, image_sequence); + auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence); input_embeds_list.push_back(std::move(embeds)); token_type_ids_list.push_back(std::move(tt_ids)); } else { - input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, vlm_perf_metrics[i], true, image_sequence)); + 
input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence)); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); @@ -328,12 +330,12 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re auto encoded_images = m_inputs_embedder->encode_images(images); std::vector> encoded_videos; for (auto& vd : video) { - auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } - const auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images, encoded_videos); - inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images, encoded_videos); + inputs = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, metrics, true, norm_prompt.images_sequence); } return add_request(request_id, inputs, sampling_params); } diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 941b382b58..323aa45279 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -52,7 +52,9 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { bool m_is_chat_conversation = false; ChatHistory m_history; std::vector m_history_images; + std::vector> m_history_videos; std::vector m_history_image_ids; + std::vector m_history_video_ids; size_t m_image_id = 0; float m_load_time_ms = 0.0f; diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index dd87bdd349..25bfe39dff 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -185,11 +185,11 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds( OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); } -std::vector InputsEmbedder::IInputsEmbedder::encode_videos(const std::vector& videos) { +std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& videos) { OPENVINO_THROW("Current model doesn't support video preprocess currently. 
Input images are processed as separate images."); } -std::pair> InputsEmbedder::IInputsEmbedder::normalize_prompt( +NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, @@ -310,8 +310,8 @@ std::vector InputsEmbedder::encode_images(const std::ve return m_impl->encode_images(images); } -std::vector InputsEmbedder::encode_videos(const std::vector& videos) { - return m_impl->encode_videos(videos); +std::vector InputsEmbedder::encode_video(const std::vector& videos) { + return m_impl->encode_video(videos); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { @@ -351,10 +351,11 @@ std::pair> InputsEmbedder::normalize_prompt( size_t base_id, const std::vector& images ) const { - return m_impl->normalize_prompt(prompt, base_id, images, {}); + auto norm_prompt = m_impl->normalize_prompt(prompt, base_id, images, {}); + return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } -std::pair> InputsEmbedder::normalize_prompt( +NormlizedPrompt InputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index f4b328f9b1..e1cb5fdaa0 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -51,7 +51,7 @@ class InputsEmbedder { std::vector encode_images(const std::vector& images); - std::vector encode_videos(const std::vector& videos); + std::vector encode_video(const std::vector& videos); // compute position ids for language model input std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -83,7 +83,7 @@ class InputsEmbedder { const std::vector& images ) const; - virtual std::pair> normalize_prompt( + virtual NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, @@ -132,7 +132,7 @@ class InputsEmbedder { virtual std::vector encode_images(const std::vector& images); - virtual std::vector encode_videos(const std::vector& videos); + virtual std::vector encode_video(const std::vector& videos); virtual std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size); @@ -163,7 +163,8 @@ class InputsEmbedder { size_t base_id, const std::vector& images ) const = 0; - virtual std::pair> normalize_prompt( + + virtual NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 927a3a40aa..b39a0de8bd 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -196,17 +196,17 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto encoded_images = m_inputs_embedder->encode_images(images); std::vector> encoded_videos; for (auto& vd : video) { - auto encoded_vd = m_inputs_embedder->encode_videos({vd}); + auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } - auto [unified_prompt, image_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); if (m_is_chat_conversation) { - m_history.push_back({{"role", "user"}, {"content", unified_prompt}}); - unified_prompt = 
m_tokenizer.apply_chat_template(m_history, true); + m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}}); + norm_prompt.unified_prompt = m_tokenizer.apply_chat_template(m_history, true); - for (size_t idx = 0; idx < image_sequence.size(); idx++) { - image_sequence[idx] -= m_image_id; + for (size_t idx = 0; idx < norm_prompt.images_sequence.size(); idx++) { + norm_prompt.images_sequence[idx] -= m_image_id; } } else { @@ -217,9 +217,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto start_get_inputs_embeds = std::chrono::steady_clock::now(); if (m_inputs_embedder->has_token_type_ids()) { - std::tie(inputs_embeds, token_type_ids) = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, image_sequence); + std::tie(inputs_embeds, token_type_ids) = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, norm_prompt.images_sequence); } else { - inputs_embeds = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, image_sequence); + inputs_embeds = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, norm_prompt.images_sequence); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/visual_language/processor_config.hpp b/src/cpp/src/visual_language/processor_config.hpp index 1ce6c964ef..3a82158d1e 100644 --- a/src/cpp/src/visual_language/processor_config.hpp +++ b/src/cpp/src/visual_language/processor_config.hpp @@ -59,6 +59,7 @@ class ProcessorConfig { size_t max_pixels = 12845056; size_t temporal_patch_size = 2; size_t merge_size = 2; + size_t tokens_per_second = 2; /// @brief Default constructor ProcessorConfig() = default; diff --git a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp index b4ef0fad0c..78b30322af 100644 --- a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp @@ -113,7 +113,7 @@ ov::Tensor InputsEmbedderQwen2_5_VL::run_image_embeddings_merger( const std::vector>& videos, const std::vector& videos_sequence ) { - auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_embeds_and_grid_thw(images, images_sequence); + auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_video_embeds_and_grid_thw(images, images_sequence, videos, videos_sequence); ov::Tensor concatenated_embeds = qwen2_vl_utils::concatenate_image_embeds(reordered_image_embeds); ov::Tensor rotary_pos_emb = get_rotary_pos_emb(reordered_images_grid_thw); diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 43de10d4c8..7469d9dba9 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -418,15 +418,30 @@ ov::Tensor transpose_image_patches(const ov::Tensor& reshaped_patches) { return transposed_patches; } -std::pair, std::vector>> reorder_image_embeds_and_grid_thw( +std::pair, std::vector>> reorder_image_video_embeds_and_grid_thw( const std::vector& encoded_images, - const std::vector& images_sequence + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence ) { std::vector image_embeds; std::vector> 
images_grid_thw; image_embeds.reserve(encoded_images.size()); images_grid_thw.reserve(encoded_images.size()); + for (const auto& encoded_video : videos) { + for (const auto& encoded_frame : encoded_video) { + ov::Tensor single_image_embeds = encoded_frame.resized_source; + image_embeds.push_back(std::move(single_image_embeds)); + } + + size_t grid_t = encoded_video.size(); + OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); + size_t grid_h = encoded_video[0].resized_source_size.height; + size_t grid_w = encoded_video[0].resized_source_size.width; + images_grid_thw.push_back({grid_t, grid_h, grid_w}); + } + for (const auto& encoded_image : encoded_images) { ov::Tensor single_image_embeds = encoded_image.resized_source; image_embeds.push_back(std::move(single_image_embeds)); @@ -439,11 +454,17 @@ std::pair, std::vector>> reorder_i std::vector reordered_image_embeds; std::vector> reordered_images_grid_thw; + for (size_t new_video_id : videos_sequence) { + reordered_image_embeds.push_back(image_embeds.at(new_video_id)); + reordered_images_grid_thw.push_back(images_grid_thw.at(new_video_id)); + } + // Todo: add offset of video? for (size_t new_image_id : images_sequence) { reordered_image_embeds.push_back(image_embeds.at(new_image_id)); reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id)); } + return {reordered_image_embeds, reordered_images_grid_thw}; } @@ -769,8 +790,8 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any return encode_with_imagepreprocess_ov({image}, config_map); } -std::vector VisionEncoderQwen2VL::encode_video(const std::vector& frames, - const ov::AnyMap& config_map) { +std::vector VisionEncoderQwen2VL::encode_frames(const std::vector& frames, + const ov::AnyMap& config_map) { ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); std::vector encoded_imgs; size_t i = 0; @@ -855,7 +876,7 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( }); } -std::pair> InputsEmbedderQwen2VL::normalize_prompt( +NormlizedPrompt InputsEmbedderQwen2VL::normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, @@ -913,10 +934,7 @@ std::pair> InputsEmbedderQwen2VL::normalize_pro unified_prompt.replace(unified_prompt.find(NATIVE_VIDEO_TAG), NATIVE_VIDEO_TAG.length(), expanded_tag); } - std::vector vision_sequence; - vision_sequence.insert(vision_sequence.end(), videos_sequence.begin(), videos_sequence.end()); - vision_sequence.insert(vision_sequence.end(), images_sequence.begin(), images_sequence.end()); - return {std::move(unified_prompt), std::move(vision_sequence)}; + return {std::move(unified_prompt), std::move(images_sequence), std::move(videos_sequence)}; } ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings, const std::vector& images_sequence) { @@ -939,6 +957,16 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p images_grid_thw.push_back({grid_t, grid_h, grid_w}); } + std::vector> video_grid_thw; + video_grid_thw.reserve(videos.size()); + for (const auto& encoded_video : videos) { + size_t grid_t = encoded_video.size(); + OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); + size_t grid_h = encoded_video[0].resized_source_size.height; + size_t grid_w = encoded_video[0].resized_source_size.width; + video_grid_thw.push_back({grid_t, grid_h, grid_w}); + } + ov::Tensor input_ids = 
get_encoded_input_ids(unified_prompt, metrics); CircularBufferQueueElementGuard embeddings_request_guard(m_embedding->get_request_queue().get()); EmbeddingsRequest& req = embeddings_request_guard.get(); @@ -946,19 +974,20 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p auto start_tokenizer_time = std::chrono::steady_clock::now(); ov::Tensor encoded_vision_start_token = m_tokenizer.encode(m_vlm_config.vision_start_token, ov::genai::add_special_tokens(false)).input_ids; - ov::Tensor encoded_image_pad_token = m_tokenizer.encode(m_vlm_config.image_pad_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_pad_token = m_tokenizer.encode(m_vlm_config.image_pad_token + m_vlm_config.video_pad_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t vision_start_token_id = encoded_vision_start_token.data()[encoded_vision_start_token.get_size() - 1]; - int64_t image_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 1]; + int64_t image_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 2]; + int64_t video_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 1]; - m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, 0, vision_start_token_id); + m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, video_grid_thw, videos_sequence, 0, vision_start_token_id); int64_t position_ids_max_element = *std::max_element(m_position_ids.data(), m_position_ids.data() + m_position_ids.get_size()); m_rope_delta = position_ids_max_element + 1 - static_cast(input_ids.get_shape().at(1)); - if (images.empty()) { + if (images.empty() && videos.empty()) { ov::Tensor inputs_embeds(text_embeds.get_element_type(), text_embeds.get_shape()); std::memcpy(inputs_embeds.data(), text_embeds.data(), text_embeds.get_byte_size()); return inputs_embeds; @@ -972,11 +1001,11 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, merged_image_embeddings_tensor, image_pad_token_id); } -std::vector InputsEmbedderQwen2VL::encode_videos(const std::vector& video) { +std::vector InputsEmbedderQwen2VL::encode_video(const std::vector& video) { std::vector embeds; for (const ov::Tensor& single_video : video) { std::vector single_frames = to_single_image_tensors({single_video}); - auto embeds_video = m_vision_encoder->encode_video(single_frames); + auto embeds_video = m_vision_encoder->encode_frames(single_frames); embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); } return embeds; @@ -1014,7 +1043,7 @@ ov::Tensor InputsEmbedderQwen2VL::run_image_embeddings_merger( const std::vector>& videos, const std::vector& videos_sequence ) { - auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_embeds_and_grid_thw(images, images_sequence); + auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_video_embeds_and_grid_thw(images, images_sequence, videos, videos_sequence); ov::Tensor concatenated_embeds = 
qwen2_vl_utils::concatenate_image_embeds(reordered_image_embeds); ov::Tensor rotary_pos_emb = get_rotary_pos_emb(reordered_images_grid_thw); @@ -1115,11 +1144,17 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids( const ov::Tensor& input_ids_tensor, const std::vector>& images_grid_thw, const std::vector& images_sequence, + const std::vector>& videos_grid_thw, + const std::vector& videos_sequence, const size_t image_id, const int64_t vision_start_token_id) { const size_t spatial_merge_size = m_vision_encoder->get_processor_config().merge_size; + const size_t tokens_per_second = m_vision_encoder->get_processor_config().tokens_per_second; std::vector> reordered_images_grid_thw; + for (size_t new_frame_id : videos_sequence) { + reordered_images_grid_thw.push_back(videos_grid_thw.at(new_frame_id - image_id)); + } for (size_t new_image_id : images_sequence) { reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id - image_id)); } @@ -1165,20 +1200,28 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids( // Process image token with grid if (grid_idx < reordered_images_grid_thw.size()) { const auto& grid = reordered_images_grid_thw.at(grid_idx); + size_t llm_grid_t = grid.at(0); size_t llm_grid_h = grid.at(1) / spatial_merge_size; size_t llm_grid_w = grid.at(2) / spatial_merge_size; - size_t ed_image = ed + llm_grid_h * llm_grid_w; + size_t ed_image = ed + llm_grid_t * llm_grid_h * llm_grid_w; // Fill temporal dimension - std::fill_n(pos_data + ed, llm_grid_h * llm_grid_w, next_pos); + size_t llm_grid_sz = llm_grid_h * llm_grid_w; + for (size_t t = 0; t < llm_grid_t; t++) { + std::fill_n(pos_data + ed + t * llm_grid_sz, llm_grid_h * llm_grid_w, next_pos + t * tokens_per_second); + } // Fill height and width dimensions int64_t* height_data = pos_data + seq_len + ed; int64_t* width_data = pos_data + 2 * seq_len + ed; - for (size_t h = 0; h < llm_grid_h; ++h) { - std::fill_n(height_data + h * llm_grid_w, llm_grid_w, next_pos + h); - for (size_t w = 0; w < llm_grid_w; ++w) { - width_data[h * llm_grid_w + w] = next_pos + w; + for (size_t t = 0; t < llm_grid_t; t++) { + size_t offset_sz = t * llm_grid_sz; + for (size_t h = 0; h < llm_grid_h; ++h) { + size_t offset = h * llm_grid_w + offset_sz; + std::fill_n(height_data + offset, llm_grid_w, next_pos + h); + for (size_t w = 0; w < llm_grid_w; ++w) { + width_data[offset + w] = next_pos + w; + } } } diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 426b493bac..d1d8dc1a20 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -18,7 +18,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { explicit VisionEncoderQwen2VL(const ModelsMap& models_map, const std::filesystem::path& config_dir_path, const std::string& device, const ov::AnyMap properties); EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override; - std::vector encode_video(const std::vector& frames, const ov::AnyMap& config_map) override; + std::vector encode_frames(const std::vector& frames, const ov::AnyMap& config_map) override; private: EncodedImage encode_with_imagepreprocess_cpp(const std::vector& image, const ov::AnyMap& config_map); @@ -51,7 +51,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const std::vector& image_sequence = {}, const std::vector& videos_sequence = {}) override; - std::vector encode_videos(const std::vector& videos) override; + std::vector encode_video(const 
std::vector& videos) override; std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) override; @@ -63,10 +63,11 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const std::string& prompt, size_t base_id, const std::vector& images) const override { - return normalize_prompt(prompt, base_id, images, {}); + auto norm_prompt = normalize_prompt(prompt, base_id, images, {}); + return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } - std::pair> normalize_prompt( + NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images, @@ -99,6 +100,8 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const ov::Tensor& input_ids_tensor, const std::vector>& images_grid_thw, const std::vector& images_sequence, + const std::vector>& videos_grid_thw, + const std::vector& videos_sequence, const size_t image_id, const int64_t vision_start_token_id ); @@ -106,9 +109,11 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { namespace qwen2_vl_utils { -std::pair, std::vector>> reorder_image_embeds_and_grid_thw( +std::pair, std::vector>> reorder_image_video_embeds_and_grid_thw( const std::vector& encoded_images, - const std::vector& images_sequence + const std::vector& images_sequence, + const std::vector>& videos, + const std::vector& videos_sequence ); ov::Tensor get_attention_mask(const std::vector>& reordered_images_grid_thw); diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index ade9736bb4..f41cc8439a 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -59,6 +59,12 @@ struct EncodedImage { ResampledImage resampled_image; }; +struct NormlizedPrompt { + std::string unified_prompt; + std::vector images_sequence; + std::vector videos_sequence; +}; + /// @brief A class used to infer embeddings of an image using /// ov::InferRequest and configured by ProcessorConfig. class VisionEncoder { @@ -103,7 +109,7 @@ class VisionEncoder { virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; /// @brief Compute embeddings of one or multiple given videos - virtual std::vector encode_video(const std::vector& frames, const ov::AnyMap& config_map = {}) { + virtual std::vector encode_frames(const std::vector& frames, const ov::AnyMap& config_map = {}) { OPENVINO_THROW("The current model does not support 'video' input, please use 'images' instead."); } From 8c0e13deab8512cf1abfae5d91c07766e1ba9a42 Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 30 Sep 2025 10:18:26 +0800 Subject: [PATCH 044/118] Add video history id.
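This patch makes the chat history track videos separately from images, so a later chat turn can refer back to previously sent videos by their own indices. A condensed sketch of the bookkeeping added here, with member names taken from the diff below (an illustration, not the complete implementation):

    // After a successful, non-cancelled generation turn:
    m_image_id += encoded_images.size();  // next image tag indexes from here
    m_video_id += encoded_videos.size();  // next video tag indexes from here

    // finish_chat() resets both histories and both counters:
    m_history_images.clear();
    m_history_videos.clear();
    m_history_image_ids.clear();
    m_history_video_ids.clear();
    m_image_id = 0;
    m_video_id = 0;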
Signed-off-by: xipingya --- .../src/continuous_batching/pipeline_base.cpp | 12 +++-- .../src/continuous_batching/pipeline_base.hpp | 1 + .../src/visual_language/inputs_embedder.cpp | 6 ++- .../src/visual_language/inputs_embedder.hpp | 2 + src/cpp/src/visual_language/pipeline.cpp | 6 ++- .../src/visual_language/qwen2vl/classes.cpp | 50 +++++++++++-------- .../src/visual_language/qwen2vl/classes.hpp | 6 ++- 7 files changed, 53 insertions(+), 30 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 81442a25b7..1195ef16c6 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -35,11 +35,14 @@ void ContinuousBatchingPipeline::IContinuousBatchingPipeline::finish_chat() { m_is_chat_conversation = false; m_history.clear(); m_history_images.clear(); + m_history_videos.clear(); m_history_image_ids.clear(); + m_history_video_ids.clear(); if (m_inputs_embedder) { m_inputs_embedder->finish_chat(); } m_image_id = 0; + m_video_id = 0; }; std::vector @@ -190,7 +193,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( encoded_images = m_inputs_embedder->encode_images(image_rgbs); m_history_images.insert(m_history_images.end(), encoded_images.begin(), encoded_images.end()); - auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos); m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}}); m_history_image_ids.insert(m_history_image_ids.end(), norm_prompt.images_sequence.begin(), norm_prompt.images_sequence.end()); m_history_video_ids.insert(m_history_video_ids.end(), norm_prompt.videos_sequence.begin(), norm_prompt.videos_sequence.end()); @@ -236,7 +239,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( encoded_videos.push_back(encoded_vd); } - auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos); m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); @@ -280,6 +283,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( m_inputs_embedder->update_chat_history(results[0].texts[0], encoded_results[0].m_status); if (encoded_results[0].m_status != ov::genai::GenerationStatus::CANCEL) { m_image_id += encoded_images.size(); + m_video_id += encoded_videos.size(); m_history.push_back({{"role", "assistant"}, {"content", results[0].texts[0]}}); } else { @@ -287,6 +291,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t idx = 0; idx < encoded_images.size(); idx++) { m_history_image_ids.pop_back(); m_history_images.pop_back(); + m_history_video_ids.pop_back(); + m_history_videos.pop_back(); } } } @@ -334,7 +340,7 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re encoded_videos.push_back(encoded_vd); } - auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, 0, 0, encoded_images, encoded_videos); inputs = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, metrics, true, norm_prompt.images_sequence); } return 
add_request(request_id, inputs, sampling_params); diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 323aa45279..98f22079c9 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -56,6 +56,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { std::vector m_history_image_ids; std::vector m_history_video_ids; size_t m_image_id = 0; + size_t m_video_id = 0; float m_load_time_ms = 0.0f; // to access m_load_time_ms diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 25bfe39dff..36a1a35bc1 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -192,6 +192,7 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_vid NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos) const { OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); @@ -351,17 +352,18 @@ std::pair> InputsEmbedder::normalize_prompt( size_t base_id, const std::vector& images ) const { - auto norm_prompt = m_impl->normalize_prompt(prompt, base_id, images, {}); + auto norm_prompt = m_impl->normalize_prompt(prompt, base_id, 0, images, {}); return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } NormlizedPrompt InputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos ) const { - return m_impl->normalize_prompt(prompt, base_id, images, videos); + return m_impl->normalize_prompt(prompt, base_id, video_base_id, images, videos); } void verify_ids(const std::vector& image_ids, size_t base_id, size_t n_images) { diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index e1cb5fdaa0..839ca3d14d 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -86,6 +86,7 @@ class InputsEmbedder { virtual NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos) const; @@ -167,6 +168,7 @@ class InputsEmbedder { virtual NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos) const; diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index b39a0de8bd..4e6ee46b45 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -46,6 +46,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ size_t m_max_kv_cache_size = std::numeric_limits::max(); bool m_is_npu = false; size_t m_image_id = 0; + size_t m_video_id = 0; ChatHistory m_history; public: @@ -199,7 +200,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } - auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, encoded_images, encoded_videos); + auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos); if 
(m_is_chat_conversation) { m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}}); @@ -207,6 +208,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ for (size_t idx = 0; idx < norm_prompt.images_sequence.size(); idx++) { norm_prompt.images_sequence[idx] -= m_image_id; + norm_prompt.videos_sequence[idx] -= m_video_id; } } else { @@ -286,6 +288,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ if (finish_info.streaming_finish_status != ov::genai::GenerationStatus::CANCEL) { m_image_id += encoded_images.size(); + m_video_id += encoded_videos.size(); // Tail of chat template is missing in KV cache. // Find the tail to concatenate it with the next input prompt. m_history.push_back({{"role", "assistant"}, {"content", decoded_results}}); @@ -332,6 +335,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ OPENVINO_ASSERT(!m_is_npu, "finish_chat() isn't supported in VLMPipeline for NPU device"); m_is_chat_conversation = false; m_image_id = 0; + m_video_id = 0; // Resetting state may be slow. m_language.reset_state(); m_language.get_tensor("attention_mask").set_shape({0, 0}); diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 7469d9dba9..09bb7137d6 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -426,20 +426,24 @@ std::pair, std::vector>> reorder_i ) { std::vector image_embeds; std::vector> images_grid_thw; - image_embeds.reserve(encoded_images.size()); - images_grid_thw.reserve(encoded_images.size()); - + size_t video_frames_sz = 0; + for (const auto& encoded_video : videos) { + video_frames_sz += encoded_video.size(); + } + + // From here on, treat the video frames as images completely. + image_embeds.reserve(encoded_images.size() + video_frames_sz); + images_grid_thw.reserve(encoded_images.size() + video_frames_sz); + for (const auto& encoded_video : videos) { for (const auto& encoded_frame : encoded_video) { ov::Tensor single_image_embeds = encoded_frame.resized_source; image_embeds.push_back(std::move(single_image_embeds)); + size_t grid_t = 1; + size_t grid_h = encoded_video[0].resized_source_size.height; + size_t grid_w = encoded_video[0].resized_source_size.width; + images_grid_thw.push_back({grid_t, grid_h, grid_w}); } - - size_t grid_t = encoded_video.size(); - OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); - size_t grid_h = encoded_video[0].resized_source_size.height; - size_t grid_w = encoded_video[0].resized_source_size.width; - images_grid_thw.push_back({grid_t, grid_h, grid_w}); } for (const auto& encoded_image : encoded_images) { @@ -454,17 +458,16 @@ std::pair, std::vector>> reorder_i std::vector reordered_image_embeds; std::vector> reordered_images_grid_thw; - for (size_t new_video_id : videos_sequence) { + for (size_t new_video_id = 0; new_video_id < video_frames_sz; new_video_id++) { reordered_image_embeds.push_back(image_embeds.at(new_video_id)); reordered_images_grid_thw.push_back(images_grid_thw.at(new_video_id)); } - // Todo: add offset of video? 
+ for (size_t new_image_id : images_sequence) { - reordered_image_embeds.push_back(image_embeds.at(new_image_id)); - reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id)); + reordered_image_embeds.push_back(image_embeds.at(new_image_id + video_frames_sz)); + reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id + video_frames_sz)); } - return {reordered_image_embeds, reordered_images_grid_thw}; } @@ -545,7 +548,8 @@ ov::Tensor merge_text_and_image_embeddings( const ov::Tensor& input_ids, const ov::Tensor& text_embeds, const ov::Tensor& processed_vision_embeds, - const int64_t image_pad_token_id + const int64_t image_pad_token_id, + const int64_t video_pad_token_id ) { ov::Tensor merged_embeds(text_embeds.get_element_type(), text_embeds.get_shape()); std::memcpy(merged_embeds.data(), text_embeds.data(), text_embeds.get_byte_size()); @@ -563,7 +567,7 @@ ov::Tensor merge_text_and_image_embeddings( for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { for (size_t seq_idx = 0; seq_idx < seq_length; ++seq_idx) { size_t flat_idx = batch_idx * seq_length + seq_idx; - if (input_ids_data[flat_idx] == image_pad_token_id) { + if (input_ids_data[flat_idx] == image_pad_token_id || input_ids_data[flat_idx] == video_pad_token_id) { std::copy_n( vision_embeds_data + vision_embed_idx * hidden_size, hidden_size, @@ -879,6 +883,7 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( NormlizedPrompt InputsEmbedderQwen2VL::normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos) const { // Images @@ -909,7 +914,7 @@ NormlizedPrompt InputsEmbedderQwen2VL::normalize_prompt( // Video std::vector videos_sequence; std::tie(unified_prompt, videos_sequence) = - normalize(unified_prompt, NATIVE_VIDEO_TAG, NATIVE_VIDEO_TAG, base_id, videos.size()); + normalize(unified_prompt, NATIVE_VIDEO_TAG, NATIVE_VIDEO_TAG, video_base_id, videos.size()); std::vector> video_grid_thw; video_grid_thw.reserve(videos.size()); @@ -922,7 +927,7 @@ NormlizedPrompt InputsEmbedderQwen2VL::normalize_prompt( } for (size_t new_image_id : videos_sequence) { - auto [grid_t, grid_h, grid_w] = video_grid_thw.at(new_image_id - base_id); + auto [grid_t, grid_h, grid_w] = video_grid_thw.at(new_image_id - video_base_id); size_t merge_length = std::pow(m_vision_encoder->get_processor_config().merge_size, 2); size_t num_video_pad_tokens = grid_t * grid_h * grid_w / merge_length; @@ -982,7 +987,7 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p int64_t image_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 2]; int64_t video_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 1]; - m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, video_grid_thw, videos_sequence, 0, vision_start_token_id); + m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, 0, video_grid_thw, videos_sequence, 0, vision_start_token_id); int64_t position_ids_max_element = *std::max_element(m_position_ids.data(), m_position_ids.data() + m_position_ids.get_size()); m_rope_delta = position_ids_max_element + 1 - static_cast(input_ids.get_shape().at(1)); @@ -998,7 +1003,7 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p } merged_image_embeddings_tensor = m_merged_image_embeddings; - return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, 
merged_image_embeddings_tensor, image_pad_token_id); + return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, merged_image_embeddings_tensor, image_pad_token_id, video_pad_token_id); } std::vector InputsEmbedderQwen2VL::encode_video(const std::vector& video) { @@ -1144,16 +1149,17 @@ ov::Tensor InputsEmbedderQwen2VL::create_position_ids( const ov::Tensor& input_ids_tensor, const std::vector>& images_grid_thw, const std::vector& images_sequence, + const size_t image_id, const std::vector>& videos_grid_thw, const std::vector& videos_sequence, - const size_t image_id, + const size_t video_id, const int64_t vision_start_token_id) { const size_t spatial_merge_size = m_vision_encoder->get_processor_config().merge_size; const size_t tokens_per_second = m_vision_encoder->get_processor_config().tokens_per_second; std::vector> reordered_images_grid_thw; for (size_t new_frame_id : videos_sequence) { - reordered_images_grid_thw.push_back(videos_grid_thw.at(new_frame_id - image_id)); + reordered_images_grid_thw.push_back(videos_grid_thw.at(new_frame_id - video_id)); } for (size_t new_image_id : images_sequence) { reordered_images_grid_thw.push_back(images_grid_thw.at(new_image_id - image_id)); diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index d1d8dc1a20..7a74d4aee0 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -63,13 +63,14 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const std::string& prompt, size_t base_id, const std::vector& images) const override { - auto norm_prompt = normalize_prompt(prompt, base_id, images, {}); + auto norm_prompt = normalize_prompt(prompt, base_id, 0, images, {}); return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } NormlizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, + size_t video_base_id, const std::vector& images, const std::vector>& videos) const override; @@ -100,9 +101,10 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const ov::Tensor& input_ids_tensor, const std::vector>& images_grid_thw, const std::vector& images_sequence, + const size_t image_id, const std::vector>& videos_grid_thw, const std::vector& videos_sequence, - const size_t image_id, + const size_t video_id, const int64_t vision_start_token_id ); }; From 64ba6845f5a385e2a87e8ced80f712814d0fc0ce Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Tue, 30 Sep 2025 10:25:31 +0800 Subject: [PATCH 045/118] Update src/cpp/include/openvino/genai/visual_language/pipeline.hpp Co-authored-by: Chen Peter --- src/cpp/include/openvino/genai/visual_language/pipeline.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 17945de39d..9824c74e74 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -115,7 +115,7 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, const GenerationConfig& generation_config, const StreamerVariant& streamer ); From 6e33dcfb8cd0808870ac99b12162993b0f0713ea Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 30 Sep 2025 10:46:30 +0800 Subject: [PATCH 046/118] Rename video to videos, reducing 
confusion: in std::vector<ov::Tensor> videos, the std::vector holds multiple videos, and each ov::Tensor has layout [N,H,W,C], where N is the number of frames of one video. Signed-off-by: xipingya --- .../genai/continuous_batching_pipeline.hpp | 2 +- .../openvino/genai/visual_language/pipeline.hpp | 7 ++++--- src/cpp/src/continuous_batching/pipeline.cpp | 4 ++-- .../src/continuous_batching/pipeline_base.cpp | 4 ++-- .../src/continuous_batching/pipeline_base.hpp | 2 +- .../continuous_batching_adapter.hpp | 4 ++-- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 8 ++++---- src/cpp/src/visual_language/pipeline_base.hpp | 16 ++++++++-------- src/cpp/src/visual_language/qwen2vl/classes.cpp | 4 ++-- src/python/py_continuous_batching_pipeline.cpp | 2 +- src/python/py_vlm_pipeline.cpp | 10 +++++----- tests/python_tests/test_vlm_pipeline.py | 2 +- 13 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index 726e76b80c..e0ffc6bb11 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -173,7 +173,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, const ov::genai::GenerationConfig& sampling_params); void step(); diff --git a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp index 9824c74e74..1f086b85cf 100644 --- a/src/cpp/include/openvino/genai/visual_language/pipeline.hpp +++ b/src/cpp/include/openvino/genai/visual_language/pipeline.hpp @@ -104,9 +104,10 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { /// @brief Generate a response given a prompt and any number of /// uint8 RGB images with [NHWC] or [HWC] layout. + /// Or uint8 RGB video frames with [NHWC] layout, where the first dimension is the number of frames. /// @param prompt A prompt to respond to. /// @param images Images to be prepended to a prompt. - /// @param video Video frames to be prepended to a prompt. + /// @param videos Multiple videos, each providing multiple frames, to be prepended to a prompt. /// @param generation_config A config to follow for text generation. /// @param streamer A streamer to acquire intermediate result. /// @return A string generated by a model. @@ -263,9 +264,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { * utils that allow to use generate() in the following way: * pipe.generate(prompt, ov::genai::image(image_tensor)). * pipe.generate(prompt, ov::genai::images(image_tensors)). - * pipe.generate(prompt, ov::genai::video(video_tensors)). + * pipe.generate(prompt, ov::genai::videos(videos_tensors)).
*/ static constexpr ov::Property image{"image"}; static constexpr ov::Property> images{"images"}; -static constexpr ov::Property> video{"video"}; +static constexpr ov::Property> videos{"videos"}; } diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index 7d786c96a7..b91ecab6b2 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -249,9 +249,9 @@ GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, GenerationHandle ContinuousBatchingPipeline::add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, const ov::genai::GenerationConfig& sampling_params) { - return m_impl->add_request(request_id, prompt, images, video, sampling_params); + return m_impl->add_request(request_id, prompt, images, videos, sampling_params); } void ContinuousBatchingPipeline::step() { diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 1195ef16c6..b5b548cdf5 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -323,7 +323,7 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); @@ -335,7 +335,7 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re auto encoded_images = m_inputs_embedder->encode_images(images); std::vector> encoded_videos; - for (auto& vd : video) { + for (auto& vd : videos) { auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 98f22079c9..34fe497f19 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -101,7 +101,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { GenerationHandle add_request(uint64_t request_id, const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, GenerationConfig sampling_params); /** diff --git a/src/cpp/src/visual_language/continuous_batching_adapter.hpp b/src/cpp/src/visual_language/continuous_batching_adapter.hpp index 1d3653be70..dbd89b66db 100644 --- a/src/cpp/src/visual_language/continuous_batching_adapter.hpp +++ b/src/cpp/src/visual_language/continuous_batching_adapter.hpp @@ -53,13 +53,13 @@ class ov::genai::VLMPipeline::VLMContinuousBatchingAdapter : public ov::genai::V VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, GenerationConfig generation_config, const StreamerVariant& streamer ) override { auto start_time = std::chrono::steady_clock::now(); auto images_vec = images.size() == 0u ? std::vector>{} : std::vector>{images}; - auto video_vec = video.size() == 0u ? std::vector>{} : std::vector>{video}; + auto video_vec = videos.size() == 0u ? 
std::vector>{} : std::vector>{videos}; auto result = m_impl.generate({prompt}, images_vec, video_vec, {generation_config}, streamer)[0]; auto stop_time = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 36a1a35bc1..3c81f5aa77 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -186,7 +186,7 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds( } std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& videos) { - OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); + OPENVINO_THROW("Current model doesn't support videos preprocess currently. Input images are processed as separate images."); } NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 4e6ee46b45..822b90bd3a 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -163,7 +163,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, GenerationConfig generation_config, const StreamerVariant& streamer ) override { @@ -196,7 +196,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto encoded_images = m_inputs_embedder->encode_images(images); std::vector> encoded_videos; - for (auto& vd : video) { + for (auto& vd : videos) { auto encoded_vd = m_inputs_embedder->encode_video({vd}); encoded_videos.push_back(encoded_vd); } @@ -470,11 +470,11 @@ VLMPipeline::~VLMPipeline() = default; VLMDecodedResults VLMPipeline::generate( const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - return m_pimpl->generate(prompt, images, video, generation_config, streamer); + return m_pimpl->generate(prompt, images, videos, generation_config, streamer); } VLMDecodedResults VLMPipeline::generate( diff --git a/src/cpp/src/visual_language/pipeline_base.hpp b/src/cpp/src/visual_language/pipeline_base.hpp index 54be4b6934..4a4fa67579 100644 --- a/src/cpp/src/visual_language/pipeline_base.hpp +++ b/src/cpp/src/visual_language/pipeline_base.hpp @@ -30,7 +30,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { virtual VLMDecodedResults generate( const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, GenerationConfig generation_config, const StreamerVariant& streamer ) = 0; @@ -41,7 +41,7 @@ class ov::genai::VLMPipeline::VLMPipelineBase { ) { auto image = config_map.find(ov::genai::image.name()); auto images = config_map.find(ov::genai::images.name()); - auto video = config_map.find(ov::genai::video.name()); + auto videos = config_map.find(ov::genai::videos.name()); ov::genai::OptionalGenerationConfig config_arg = utils::get_config_from_map(config_map); GenerationConfig config = (config_arg.has_value()) ? 
*config_arg : get_generation_config(); @@ -64,13 +64,13 @@ class ov::genai::VLMPipeline::VLMPipelineBase { } } - if (config_map.end() != video) { - if (video->second.is>()) { - video_rgbs = video->second.as>(); - } else if (video->second.is()) { - video_rgbs = {video->second.as()}; + if (config_map.end() != videos) { + if (videos->second.is>()) { + video_rgbs = videos->second.as>(); + } else if (videos->second.is()) { + video_rgbs = {videos->second.as()}; } else { - OPENVINO_THROW("Unknown video type."); + OPENVINO_THROW("Unknown videos type."); } } diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 09bb7137d6..7d6c066b29 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -1006,9 +1006,9 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, merged_image_embeddings_tensor, image_pad_token_id, video_pad_token_id); } -std::vector InputsEmbedderQwen2VL::encode_video(const std::vector& video) { +std::vector InputsEmbedderQwen2VL::encode_video(const std::vector& videos) { std::vector embeds; - for (const ov::Tensor& single_video : video) { + for (const ov::Tensor& single_video : videos) { std::vector single_frames = to_single_image_tensors({single_video}); auto embeds_video = m_vision_encoder->encode_frames(single_frames); embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 36dea1dd0d..56dc6990e8 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -455,7 +455,7 @@ void init_continuous_batching_pipeline(py::module_& m) { py::arg("request_id"), py::arg("prompt"), py::arg("images"), - py::arg("video"), + py::arg("videos"), py::arg("generation_config")) .def("step", &ContinuousBatchingPipeline::step) diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 5d24cdd1a3..44febc7d94 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -132,7 +132,7 @@ py::object call_vlm_generate( ov::genai::VLMPipeline& pipe, const std::string& prompt, const std::vector& images, - const std::vector& video, + const std::vector& videos, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& py_streamer, const py::kwargs& kwargs @@ -142,7 +142,7 @@ py::object call_vlm_generate( ov::genai::VLMDecodedResults res; { py::gil_scoped_release rel; - res= pipe.generate(prompt, images, video, updated_config, streamer); + res= pipe.generate(prompt, images, videos, updated_config, streamer); } return py::cast(res); } @@ -247,15 +247,15 @@ void init_vlm_pipeline(py::module_& m) { "generate", [](ov::genai::VLMPipeline& pipe, const std::string& prompt, - const std::vector& video, + const std::vector& videos, const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer, const py::kwargs& kwargs ) -> py::typing::Union { - return call_vlm_generate(pipe, prompt, {}, video, generation_config, streamer, kwargs); + return call_vlm_generate(pipe, prompt, {}, videos, generation_config, streamer, kwargs); }, py::arg("prompt"), "Input string", - py::arg("video"), "Input video", + py::arg("videos"), "Input videos, each providing multiple frames", py::arg("generation_config"), 
"generation_config", py::arg("streamer") = std::monostate(), "streamer", (vlm_generate_docstring + std::string(" \n ")).c_str() diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 675c10f02a..03eebedf37 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -1083,4 +1083,4 @@ def test_vlm_pipeline_video_input(request, model_id, video_name, backend): model_path = get_ov_model(model_id) vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - genai_output = vlm.generate(prompt, video=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file + genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file From 6bf63de80cffab9b08213868d5d2dce038fb9fa0 Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 30 Sep 2025 11:33:21 +0800 Subject: [PATCH 047/118] Remove useless header. --- src/cpp/src/visual_language/vision_encoder.hpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index f41cc8439a..f930b0d19a 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -5,7 +5,6 @@ #include #include "openvino/runtime/infer_request.hpp" -#include "logger.hpp" #include "openvino/genai/common_types.hpp" #include "visual_language/vlm_config.hpp" #include "visual_language/processor_config.hpp" From eb4faea682eee7c65fdb95e5ce125bdd4d46c6cb Mon Sep 17 00:00:00 2001 From: xipingya Date: Tue, 30 Sep 2025 11:40:32 +0800 Subject: [PATCH 048/118] Update video-> videos in Readme Signed-off-by: xipingya --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 313b12709e..5749b0b6e3 100644 --- a/README.md +++ b/README.md @@ -164,8 +164,8 @@ result = pipe.generate(prompt, image=image_data, max_new_tokens=100) # To input multiple images, use 'images=' # result = pipe.generate(prompt, images=[image_data], max_new_tokens=100) -# To input video frames, use 'video=' -# result = pipe.generate(prompt, video=[image_data], max_new_tokens=100) +# To input videos frames, use 'videos=' +# result = pipe.generate(prompt, videos=[frames_data], max_new_tokens=100) print(result.texts[0]) ``` @@ -192,8 +192,8 @@ int main(int argc, char* argv[]) { // To input multiple images, use 'images' // pipe.generate(prompt, ov::genai::images(std::vector{rgb}), ov::genai::max_new_tokens(100)); - // To input video frames, use 'video' - // pipe.generate(prompt, ov::genai::video(std::vector{rgb}), ov::genai::max_new_tokens(100)); + // To input videos frames, use 'videos' + // pipe.generate(prompt, ov::genai::videos(std::vector{frames}), ov::genai::max_new_tokens(100)); } ``` From 123221b7b9133d751c75dacb6e98ba577be3a13f Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 30 Sep 2025 17:06:41 +0800 Subject: [PATCH 049/118] all video -> videos Signed-off-by: xiping.yan --- .../include/openvino/genai/continuous_batching_pipeline.hpp | 2 +- src/cpp/src/continuous_batching/pipeline.cpp | 4 ++-- src/cpp/src/continuous_batching/pipeline_base.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp index e0ffc6bb11..9ccec57c2f 100644 --- a/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp +++ 
b/src/cpp/include/openvino/genai/continuous_batching_pipeline.hpp @@ -192,7 +192,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline { std::vector generate( const std::vector& prompts, const std::vector>& images, - const std::vector>& video, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer=std::monostate{}); diff --git a/src/cpp/src/continuous_batching/pipeline.cpp b/src/cpp/src/continuous_batching/pipeline.cpp index b91ecab6b2..7abec55b88 100644 --- a/src/cpp/src/continuous_batching/pipeline.cpp +++ b/src/cpp/src/continuous_batching/pipeline.cpp @@ -293,10 +293,10 @@ std::vector ContinuousBatchingPipeline::generate( std::vector ContinuousBatchingPipeline::generate( const std::vector& prompts, const std::vector>& images, - const std::vector>& video, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer) { - return m_impl->generate(prompts, images, video, sampling_params, streamer); + return m_impl->generate(prompts, images, videos, sampling_params, streamer); } void ContinuousBatchingPipeline::start_chat(const std::string& system_message) { diff --git a/src/cpp/src/continuous_batching/pipeline_base.hpp b/src/cpp/src/continuous_batching/pipeline_base.hpp index 34fe497f19..687ad842e1 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.hpp +++ b/src/cpp/src/continuous_batching/pipeline_base.hpp @@ -140,7 +140,7 @@ class ContinuousBatchingPipeline::IContinuousBatchingPipeline { virtual std::vector generate(const std::vector& prompts, const std::vector>& images, - const std::vector>& video, + const std::vector>& videos, const std::vector& sampling_params, const StreamerVariant& streamer); From 515c91141c9626794477627814aa301fd19864ae Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 30 Sep 2025 21:34:48 +0800 Subject: [PATCH 050/118] Fall back to image processing when the model does not implement video processing.
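Models that only implement the image path now degrade gracefully: the base-class video overloads forward to the image-only overloads when no videos are passed and throw otherwise. A rough sketch of the dispatch added in inputs_embedder.cpp (signatures shortened and template arguments restored for readability; see the diff below for the actual code):

    NormlizedPrompt IInputsEmbedder::normalize_prompt(
        const std::string& prompt, size_t base_id, size_t video_base_id,
        const std::vector<ov::Tensor>& images,
        const std::vector<std::vector<EncodedImage>>& videos) const {
        if (videos.size() > 0) {
            OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead.");
        }
        NormlizedPrompt norm_prompt;  // videos_sequence stays empty
        std::tie(norm_prompt.unified_prompt, norm_prompt.images_sequence) =
            normalize_prompt(prompt, base_id, images);
        return norm_prompt;
    }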
Signed-off-by: xiping.yan --- .../src/continuous_batching/pipeline_base.cpp | 22 ++++++++-- .../src/visual_language/inputs_embedder.cpp | 43 +++++++++++++++++-- .../src/visual_language/inputs_embedder.hpp | 17 ++++++++ 3 files changed, 75 insertions(+), 7 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index b5b548cdf5..43c9179fde 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -203,12 +203,13 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( m_inputs_embedder->set_apply_chat_template_status(false); if (m_inputs_embedder->has_token_type_ids()) { - // Todo: support video auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, + m_history_videos, vlm_perf_metrics[0], true, - m_history_image_ids); + m_history_image_ids, + m_history_video_ids); input_embeds_list.push_back(std::move(embeds)); token_type_ids_list.push_back(std::move(tt_ids)); } else { @@ -244,11 +245,24 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); if (m_inputs_embedder->has_token_type_ids()) { - auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence); + auto [embeds, tt_ids] = + m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, + encoded_images, + encoded_videos, + vlm_perf_metrics[i], + true, + norm_prompt.images_sequence, + norm_prompt.videos_sequence); input_embeds_list.push_back(std::move(embeds)); token_type_ids_list.push_back(std::move(tt_ids)); } else { - input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, vlm_perf_metrics[i], true, norm_prompt.images_sequence)); + input_embeds_list.emplace_back(m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, + encoded_images, + encoded_videos, + vlm_perf_metrics[i], + true, + norm_prompt.images_sequence, + norm_prompt.videos_sequence)); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 3c81f5aa77..b9f1583fb7 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -182,11 +182,15 @@ ov::Tensor InputsEmbedder::IInputsEmbedder::get_inputs_embeds( bool recalculate_merged_embeddings, const std::vector& images_sequence, const std::vector& videos_sequence) { - OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); + if (videos.size() > 0) { + OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead."); + } else { + return get_inputs_embeds(prompt, images, metrics, recalculate_merged_embeddings, images_sequence); + } } std::vector InputsEmbedder::IInputsEmbedder::encode_video(const std::vector& videos) { - OPENVINO_THROW("Current model doesn't support videos preprocess currently. Input images are processed as separate images."); + OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. 
Please use 'images' instead."); } NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( @@ -195,7 +199,13 @@ NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( size_t video_base_id, const std::vector& images, const std::vector>& videos) const { - OPENVINO_THROW("Current model doesn't support video preprocess currently. Input images are processed as separate images."); + if (videos.size() > 0) { + OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead."); + } else { + NormlizedPrompt norm_prompt; + std::tie(norm_prompt.unified_prompt, norm_prompt.images_sequence) = normalize_prompt(prompt, base_id, images); + return norm_prompt; + } } std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( @@ -207,6 +217,21 @@ std::pair InputsEmbedder::IInputsEmbedder::get_inputs_em OPENVINO_THROW("This model does not support token_type_ids."); } +std::pair InputsEmbedder::IInputsEmbedder::get_inputs_embeds_with_token_type_ids( + const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings, + const std::vector& image_sequence, + const std::vector& videos_sequence) { + if (videos.size() > 0) { + OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead."); + } else { + return get_inputs_embeds_with_token_type_ids(prompt, images, metrics, recalculate_merged_embeddings, image_sequence); + } +} + bool InputsEmbedder::IInputsEmbedder::has_token_type_ids() const { return false; } /// Public InputsEmbedder class @@ -303,6 +328,18 @@ std::pair InputsEmbedder::get_inputs_embeds_with_token_t prompt, images, metrics, recalculate_merged_embeddings, image_sequence); } +std::pair InputsEmbedder::get_inputs_embeds_with_token_type_ids( + const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings, + const std::vector& image_sequence, + const std::vector& videos_sequence) { + return m_impl->get_inputs_embeds_with_token_type_ids( + prompt, images, videos, metrics, recalculate_merged_embeddings, image_sequence, videos_sequence); +} + bool InputsEmbedder::has_token_type_ids() const { return m_impl->has_token_type_ids(); } diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 839ca3d14d..3e8ba541ee 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -47,6 +47,15 @@ class InputsEmbedder { // compute input embedding and token_type_ids std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}); + std::pair get_inputs_embeds_with_token_type_ids( + const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings = true, + const std::vector& image_sequence = {}, + const std::vector& videos_sequence = {}); + bool has_token_type_ids() const; std::vector encode_images(const std::vector& images); @@ -128,6 +137,14 @@ class InputsEmbedder { const std::vector& videos_sequence = {}); virtual std::pair get_inputs_embeds_with_token_type_ids(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings 
= true, const std::vector& image_sequence = {}); + virtual std::pair get_inputs_embeds_with_token_type_ids( + const std::string& prompt, + const std::vector& images, + const std::vector>& videos, + ov::genai::VLMPerfMetrics& metrics, + bool recalculate_merged_embeddings = true, + const std::vector& image_sequence = {}, + const std::vector& videos_sequence = {}); virtual bool has_token_type_ids() const; From 7c9a2204faf8112038ceb9906c8aa6765c7def2d Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 7 Oct 2025 20:09:31 +0800 Subject: [PATCH 051/118] Update test for video input. Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 114 +++++++++++++++--------- 1 file changed, 72 insertions(+), 42 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 03eebedf37..e06cddb124 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -156,6 +156,31 @@ def handwritten_tensor(pytestconfig): handwritten_url = "https://github.com/user-attachments/assets/8c9ae017-7837-4abc-ae92-c1054c9ec350" return openvino.Tensor(from_cache_or_download(pytestconfig, handwritten_url, "handwritten.png")) +# Return video with shape: [num_frames, height, width, 3] +def create_countdown_frames(): + frames_count = 5 + height = 240 + width = 360 + frame_list = [] + for count in range(frames_count, 0, -1): + frame = np.zeros((height, width, 3), dtype=np.uint8) + text = str(count) + (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 3, 4) + + text_x = (width - text_width) // 2 + text_y = (height + text_height) // 2 + + cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, + 3, (255, 255, 255), 4, cv2.LINE_AA + ) + + frame_list.append(frame) + ov_tensor = openvino.Tensor(np.stack(frame_list)) + return ov_tensor + +@pytest.fixture(scope="module") +def countdown_video(): + return create_countdown_frames() model_ids = [ "katuni4ka/tiny-random-minicpmv-2_6", @@ -170,6 +195,11 @@ def handwritten_tensor(pytestconfig): "qnguyen3/nanoLLaVA" ] +model_video_ids = [ + "katuni4ka/tiny-random-qwen2vl", + "katuni4ka/tiny-random-qwen2.5-vl" +] + # On macOS, transformers<4.52 is required, but this causes gemma3 to fail GEMMA3_MACOS_XFAIL_REASON = "gemma3 not supported on macOS with older transformers" @@ -212,6 +242,33 @@ def streamer(word: str) -> bool: gc.collect() +@pytest.mark.precommit +@pytest.mark.parametrize("model_id", model_video_ids) +@pytest.mark.parametrize("backend", attention_backend) +def test_vlm_pipeline_video_input(model_id, backend, cat_tensor, countdown_video): + def streamer(word: str) -> bool: + nonlocal result_from_streamer + result_from_streamer.append(word) + return False + + models_path = get_ov_model(model_id) + ov_pipe = VLMPipeline(models_path, "CPU", ATTENTION_BACKEND=backend) + generation_config = ov_pipe.get_generation_config() + generation_config.max_new_tokens = 30 + generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) + + for images, videos in [[], [countdown_video]], [[cat_tensor], [countdown_video]]: + result_from_streamer = [] + res = ov_pipe.generate( + prompts[0], + images=images, + videos=videos, + generation_config=generation_config, + streamer=streamer, + ) + assert res.texts[0] == "".join(result_from_streamer) + + gc.collect() configs = [ get_greedy(), @@ -952,33 +1009,6 @@ def cat_image_384x384(cat_image): def cat_image_32x32(cat_image): return cat_image.resize((32, 32)) -# Return video with shape: 
[num_frames, height, width, 3] -def create_countdown_frames(): - frames_count = 5 - height = 240 - width = 360 - frame_list = [] - for count in range(frames_count, 0, -1): - frame = np.zeros((height, width, 3), dtype=np.uint8) - text = str(count) - (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 3, 4) - - text_x = (width - text_width) // 2 - text_y = (height + text_height) // 2 - - cv2.putText(frame, text, (text_x, text_y), cv2.FONT_HERSHEY_SIMPLEX, - 3, (255, 255, 255), 4, cv2.LINE_AA - ) - - frame_list.append(frame) - ov_tensor = openvino.Tensor(np.stack(frame_list)) - return ov_tensor - -@pytest.fixture(scope="module") -def countdown_video(): - return create_countdown_frames() - - @pytest.mark.precommit @pytest.mark.parametrize( "model_id, image_name, backend", @@ -1067,20 +1097,20 @@ def get_nanollava_processor(): gc.collect() -@pytest.mark.precommit -@pytest.mark.parametrize( - "model_id, video_name, backend", - [ - pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "SDPA"), - pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA"), - ], -) -def test_vlm_pipeline_video_input(request, model_id, video_name, backend): - video_tensor = request.getfixturevalue(video_name) - prompt = "Describe this video." - max_new_tokens = 10 +# @pytest.mark.precommit +# @pytest.mark.parametrize( +# "model_id, video_name, backend", +# [ +# pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "SDPA"), +# pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA"), +# ], +# ) +# def test_vlm_pipeline_video_input(request, model_id, video_name, backend): +# video_tensor = request.getfixturevalue(video_name) +# prompt = "Describe this video." +# max_new_tokens = 10 - model_path = get_ov_model(model_id) +# model_path = get_ov_model(model_id) - vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file +# vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) +# genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file From 28242fe67df78dab653bc68b490a5ebc7ef9df70 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Tue, 7 Oct 2025 21:24:47 +0800 Subject: [PATCH 052/118] Add test: CB+Add_request. 
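The test drives the same vision inputs through both entry points, VLMPipeline::generate and ContinuousBatchingPipeline::add_request plus step, and expects identical texts, scores, and finish reasons. For reference, a rough C++ analogue of the continuous-batching half, assuming the videos-aware add_request overload declared earlier in this series (the handle API is abbreviated and not verified here):

    // Sketch only: enqueue a request with one video, then drive the scheduler.
    ov::genai::ContinuousBatchingPipeline cb_pipe(models_path, scheduler_config, "CPU");
    auto handle = cb_pipe.add_request(
        /*request_id=*/0,
        prompt,
        std::vector<ov::Tensor>{},              // no images
        std::vector<ov::Tensor>{video_tensor},  // one video, layout [N,H,W,C]
        sampling_params);
    while (handle->get_status() != ov::genai::GenerationStatus::FINISHED) {
        cb_pipe.step();  // advances all scheduled requests by one iteration
    }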
Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 34 +++++++++++++++++-------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index e06cddb124..4c81b9c12d 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -275,12 +275,14 @@ def streamer(word: str) -> bool: get_beam_search(), ] +video_input = [False, True] @pytest.mark.precommit @pytest.mark.parametrize("config", configs) -def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): +@pytest.mark.parametrize("is_video_input", video_input) +def test_vlm_continuous_batching_generate_vs_add_request(config, is_video_input, cat_tensor, countdown_video): scheduler_config = SchedulerConfig() - models_path = get_ov_model(model_ids[0]) + models_path = get_ov_model(model_video_ids[0] if is_video_input else model_ids[0]) ov_pipe = VLMPipeline( models_path, "CPU", @@ -290,15 +292,23 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): generation_config = config generation_config.max_new_tokens = 30 eps = 0.001 - image_links_list = [[], [cat_tensor]] + vision_links_list = [[], [countdown_video]] if is_video_input else [[], [cat_tensor]] res_generate = [] - for images in image_links_list: - res_generate.append( - ov_pipe.generate( - prompts[0], images=images, generation_config=generation_config + if is_video_input: + for videos in vision_links_list: + res_generate.append( + ov_pipe.generate( + prompts[0], videos=videos, generation_config=generation_config + ) + ) + else: + for images in vision_links_list: + res_generate.append( + ov_pipe.generate( + prompts[0], images=images, generation_config=generation_config + ) ) - ) cb_pipe = ContinuousBatchingPipeline( models_path, @@ -308,8 +318,12 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): ) tokenizer = cb_pipe.get_tokenizer() - for idx, images in enumerate(image_links_list): - handle = cb_pipe.add_request(idx, prompts[0], images, generation_config) + for idx, images in enumerate(vision_links_list): + if is_video_input: + handle = cb_pipe.add_request(idx, prompts[0], [], images, generation_config) + else: + handle = cb_pipe.add_request(idx, prompts[0], images, generation_config) + while handle.get_status() != GenerationStatus.FINISHED: cb_pipe.step() outputs = handle.read_all() From 5e637df2c1ea5f04f6f61bcac9bb36aed12ff8af Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Wed, 8 Oct 2025 10:40:01 +0800 Subject: [PATCH 053/118] Add test comparing with optimum; the results still differ.
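The comparison is only meaningful when both sides decode greedily: do_sample=False for the optimum-intel reference and plain greedy decoding with the same max_new_tokens for GenAI. The GenAI half of the test corresponds to this C++ usage, mirroring the README example from earlier in this series (variable names are placeholders):

    // Sketch: greedy generation with a single video tensor of [N,H,W,C] frames.
    ov::genai::VLMPipeline pipe(models_path, "CPU");
    ov::genai::VLMDecodedResults out = pipe.generate(
        "Describe this video.",
        ov::genai::videos(std::vector<ov::Tensor>{video_tensor}),
        ov::genai::max_new_tokens(20));
    // out.texts[0] is what gets compared against the optimum-intel text.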
Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 66 +++++++++++++++++++------ 1 file changed, 51 insertions(+), 15 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 4c81b9c12d..42cc738be0 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -25,6 +25,7 @@ model_and_tag """ +import torch import openvino_tokenizers import openvino import gc @@ -1111,20 +1112,55 @@ def get_nanollava_processor(): gc.collect() -# @pytest.mark.precommit -# @pytest.mark.parametrize( -# "model_id, video_name, backend", -# [ -# pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "SDPA"), -# pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA"), -# ], -# ) -# def test_vlm_pipeline_video_input(request, model_id, video_name, backend): -# video_tensor = request.getfixturevalue(video_name) -# prompt = "Describe this video." -# max_new_tokens = 10 +@pytest.mark.precommit +@pytest.mark.parametrize( + "model_id, video_name, backend", + [ + pytest.param("katuni4ka/tiny-random-qwen2vl", "countdown_video", "SDPA"), + pytest.param("katuni4ka/tiny-random-qwen2vl", "countdown_video", "PA"), + pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "SDPA"), + pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA", marks=pytest.mark.xfail(reason="CVS-167316")), + ], +) +def test_vlm_pipeline_video_input_match_optimum(request, model_id, video_name, backend): + video_tensor = request.getfixturevalue(video_name) + if isinstance(video_tensor, openvino.Tensor): + video_torch_tensor = torch.from_numpy(video_tensor.data) + video_frames = torch.unbind(video_torch_tensor, dim=0) + + prompt = "Describe this video." 
+ max_new_tokens = 20 + + model_path = get_ov_model(model_id) + + # Run the model with optimum-intel + model = OVModelForVisualCausalLM.from_pretrained(model_path, trust_remote_code=True) + conversation = [ + { + "role": "user", + "content": [ + {"type": "video"}, + {"type": "text", "text": prompt}, + ], + } + ] -# model_path = get_ov_model(model_id) + processor = transformers.AutoProcessor.from_pretrained(model_path, trust_remote_code=True) + templated_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) -# vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) -# genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens) \ No newline at end of file + inputs = processor(text=[templated_prompt], videos=[video_frames], padding=True, return_tensors="pt") + + output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) + input_ids = inputs["input_ids"] if isinstance(inputs, dict) else inputs.input_ids + generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(input_ids, output_ids)] + + optimum_output = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + optimum_text = optimum_output[0] + + # Run the model with GenAI + vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) + genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens, do_sample=False) + genai_text = genai_output.texts[0] + + assert optimum_text == genai_text + gc.collect() \ No newline at end of file From ef752e248d1ec02d33d9ae4038b241697f3027a1 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Wed, 8 Oct 2025 15:52:37 +0800 Subject: [PATCH 054/118] CB + add_request test passes. Backend==PA is marked xfail due to CVS-167316: the CB pipeline uses wrong position_ids.
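The PA backend stays xfail until the continuous-batching path produces the same 3D rotary positions as the stateful path. To illustrate what create_position_ids, as changed earlier in this series, computes for video tokens: for one video patch grid with llm_grid_t=2, llm_grid_h=2, llm_grid_w=2, starting at next_pos=5 with tokens_per_second=2, the three position rows come out as

    // temporal: 5 5 5 5  7 7 7 7   (next_pos + t * tokens_per_second)
    // height:   5 5 6 6  5 5 6 6   (next_pos + h, repeated per frame)
    // width:    5 6 5 6  5 6 5 6   (next_pos + w, repeated per frame)

A backend that falls back to the flat, image-style layout diverges on exactly these rows, which is the presumed mismatch behind CVS-167316.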
Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 87 ++++++++++++++++++------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 42cc738be0..a8312202e9 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -276,14 +276,11 @@ def streamer(word: str) -> bool: get_beam_search(), ] -video_input = [False, True] - @pytest.mark.precommit @pytest.mark.parametrize("config", configs) -@pytest.mark.parametrize("is_video_input", video_input) -def test_vlm_continuous_batching_generate_vs_add_request(config, is_video_input, cat_tensor, countdown_video): +def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): scheduler_config = SchedulerConfig() - models_path = get_ov_model(model_video_ids[0] if is_video_input else model_ids[0]) + models_path = get_ov_model(model_ids[0]) ov_pipe = VLMPipeline( models_path, "CPU", @@ -293,23 +290,15 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, is_video_input, generation_config = config generation_config.max_new_tokens = 30 eps = 0.001 - vision_links_list = [[], [countdown_video]] if is_video_input else [[], [cat_tensor]] + vision_links_list = [[], [cat_tensor]] res_generate = [] - if is_video_input: - for videos in vision_links_list: - res_generate.append( - ov_pipe.generate( - prompts[0], videos=videos, generation_config=generation_config - ) - ) - else: - for images in vision_links_list: - res_generate.append( - ov_pipe.generate( - prompts[0], images=images, generation_config=generation_config - ) + for images in vision_links_list: + res_generate.append( + ov_pipe.generate( + prompts[0], images=images, generation_config=generation_config ) + ) cb_pipe = ContinuousBatchingPipeline( models_path, @@ -320,10 +309,62 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, is_video_input, tokenizer = cb_pipe.get_tokenizer() for idx, images in enumerate(vision_links_list): - if is_video_input: - handle = cb_pipe.add_request(idx, prompts[0], [], images, generation_config) - else: - handle = cb_pipe.add_request(idx, prompts[0], images, generation_config) + handle = cb_pipe.add_request(idx, prompts[0], images, generation_config) + + while handle.get_status() != GenerationStatus.FINISHED: + cb_pipe.step() + outputs = handle.read_all() + for out_idx, output in enumerate(outputs): + text = tokenizer.decode(output.generated_ids) + assert text == res_generate[idx].texts[out_idx] + assert abs(output.score - res_generate[idx].scores[out_idx]) < eps + assert ( + output.finish_reason == GenerationFinishReason.STOP + or output.finish_reason == GenerationFinishReason.LENGTH + ) + + +@pytest.mark.precommit +@pytest.mark.parametrize("config", configs) +@pytest.mark.parametrize( + "backend", + [ + pytest.param("SDPA"), + pytest.param("PA", marks=pytest.mark.xfail(reason="CVS-167316")), + ], +) +def test_vlm_continuous_batching_generate_vs_add_request_video_input(config, backend, countdown_video): + scheduler_config = SchedulerConfig() + models_path = get_ov_model(model_video_ids[0]) + ov_pipe = VLMPipeline( + models_path, + "CPU", + ATTENTION_BACKEND=backend, + **get_default_llm_properties(), + ) + generation_config = config + generation_config.max_new_tokens = 30 + eps = 0.001 + video_links_list = [[], [countdown_video]] + + res_generate = [] + for videos in video_links_list: + res_generate.append( + ov_pipe.generate( + prompts[0], videos=videos, 
generation_config=generation_config + ) + ) + + cb_pipe = ContinuousBatchingPipeline( + models_path, + scheduler_config=scheduler_config, + device="CPU", + properties=get_default_llm_properties(), + ) + tokenizer = cb_pipe.get_tokenizer() + + for idx, videos in enumerate(video_links_list): + handle = cb_pipe.add_request(idx, prompts[0], [], videos, generation_config) while handle.get_status() != GenerationStatus.FINISHED: cb_pipe.step() From b6a87e5cf29e23d0d78092dd2b59dbccb0c7b13e Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Wed, 8 Oct 2025 16:15:07 +0800 Subject: [PATCH 055/118] vlm pipeline vs optimum. still fail. Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index a8312202e9..85ceb05aec 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -1163,11 +1163,10 @@ def get_nanollava_processor(): pytest.param("katuni4ka/tiny-random-qwen2.5-vl", "countdown_video", "PA", marks=pytest.mark.xfail(reason="CVS-167316")), ], ) -def test_vlm_pipeline_video_input_match_optimum(request, model_id, video_name, backend): - video_tensor = request.getfixturevalue(video_name) - if isinstance(video_tensor, openvino.Tensor): - video_torch_tensor = torch.from_numpy(video_tensor.data) - video_frames = torch.unbind(video_torch_tensor, dim=0) +def test_vlm_pipeline_match_optimum_video_input(request, model_id, video_name, backend): + video_ov_tensor = request.getfixturevalue(video_name) + assert(isinstance(video_ov_tensor, openvino.Tensor)) + video_torch_tensor = torch.from_numpy(video_ov_tensor.data) prompt = "Describe this video." max_new_tokens = 20 @@ -1188,8 +1187,7 @@ def test_vlm_pipeline_video_input_match_optimum(request, model_id, video_name, b processor = transformers.AutoProcessor.from_pretrained(model_path, trust_remote_code=True) templated_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - - inputs = processor(text=[templated_prompt], videos=[video_frames], padding=True, return_tensors="pt") + inputs = processor(text=[templated_prompt], video=video_torch_tensor, padding=True, return_tensors="pt") output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) input_ids = inputs["input_ids"] if isinstance(inputs, dict) else inputs.input_ids @@ -1200,7 +1198,7 @@ def test_vlm_pipeline_video_input_match_optimum(request, model_id, video_name, b # Run the model with GenAI vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND=backend) - genai_output = vlm.generate(prompt, videos=[video_tensor], max_new_tokens=max_new_tokens, do_sample=False) + genai_output = vlm.generate(prompt, videos=[video_ov_tensor], max_new_tokens=max_new_tokens, do_sample=False) genai_text = genai_output.texts[0] assert optimum_text == genai_text From 1d810b6d548b4d878b35cb41f23157adcba6c494 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Thu, 9 Oct 2025 09:16:05 +0800 Subject: [PATCH 056/118] Apply suggestion from @Copilot Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/python/py_vlm_pipeline.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/python/py_vlm_pipeline.cpp b/src/python/py_vlm_pipeline.cpp index 44febc7d94..94c4bc66f6 100644 --- a/src/python/py_vlm_pipeline.cpp +++ b/src/python/py_vlm_pipeline.cpp @@ -48,8 +48,8 @@ auto vlm_generate_docstring = R"( :param images: image or list 
of images :type images: list[ov.Tensor] or ov.Tensor - :param video: list of frames - :type video: list[ov.Tensor] + :param videos: list of frames + :type videos: list[ov.Tensor] :param generation_config: generation_config :type generation_config: GenerationConfig or a dict From 01bbd49e698b75a7204b2cb45caf6d8023070419 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 9 Oct 2025 09:20:16 +0800 Subject: [PATCH 057/118] Revert useless update. Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 85ceb05aec..22f88784df 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -290,10 +290,10 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): generation_config = config generation_config.max_new_tokens = 30 eps = 0.001 - vision_links_list = [[], [cat_tensor]] + image_links_list = [[], [cat_tensor]] res_generate = [] - for images in vision_links_list: + for images in image_links_list: res_generate.append( ov_pipe.generate( prompts[0], images=images, generation_config=generation_config @@ -308,7 +308,7 @@ def test_vlm_continuous_batching_generate_vs_add_request(config, cat_tensor): ) tokenizer = cb_pipe.get_tokenizer() - for idx, images in enumerate(vision_links_list): + for idx, images in enumerate(image_links_list): handle = cb_pipe.add_request(idx, prompts[0], images, generation_config) while handle.get_status() != GenerationStatus.FINISHED: From 274108efdc4e28aff63542431de6db6f84431208 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 9 Oct 2025 09:29:25 +0800 Subject: [PATCH 058/118] merge master submodule --- thirdparty/openvino_tokenizers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thirdparty/openvino_tokenizers b/thirdparty/openvino_tokenizers index 191359a9f4..760f9140ab 160000 --- a/thirdparty/openvino_tokenizers +++ b/thirdparty/openvino_tokenizers @@ -1 +1 @@ -Subproject commit 191359a9f49ac09cc169b9e3f81eb43d239a9690 +Subproject commit 760f9140ab1fd330d62ec171673ca0705abe6aa0 From dff97c12ab4f5dbd6969b2eeff898e87c20cd0c5 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 9 Oct 2025 09:46:13 +0800 Subject: [PATCH 059/118] clarify frames data layout. 
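For reference, the layout being documented: one video is a single openvino.Tensor of uint8 RGB frames stacked along the first axis, [num_frames, H, W, C]. A minimal sketch of building such an input with NumPy (pipe and prompt are assumed to exist):

    import numpy as np
    import openvino

    frames = np.zeros((5, 240, 320, 3), dtype=np.uint8)  # 5 black 320x240 RGB frames
    video = openvino.Tensor(frames)
    # result = pipe.generate(prompt, videos=[video], max_new_tokens=100)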
Signed-off-by: xiping.yan --- README.md | 2 +- tests/python_tests/test_vlm_pipeline.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5749b0b6e3..0c2b9e599b 100644 --- a/README.md +++ b/README.md @@ -164,7 +164,7 @@ result = pipe.generate(prompt, image=image_data, max_new_tokens=100) # To input multiple images, use 'images=' # result = pipe.generate(prompt, images=[image_data], max_new_tokens=100) -# To input videos frames, use 'videos=' +# To input video frames, use 'videos='; frames_data layout = [num_frames, H, W, C] # result = pipe.generate(prompt, videos=[frames_data], max_new_tokens=100) print(result.texts[0]) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 22f88784df..48697399ff 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -1065,6 +1065,7 @@ def cat_image_384x384(cat_image): def cat_image_32x32(cat_image): return cat_image.resize((32, 32)) + @pytest.mark.precommit @pytest.mark.parametrize( "model_id, image_name, backend", From 5cac72eef43f5fec81b4755e319ac502310be39c Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 9 Oct 2025 11:22:36 +0800 Subject: [PATCH 060/118] Fix bug: passing only images triggers a crash. Signed-off-by: xiping.yan --- src/cpp/src/visual_language/pipeline.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 71cbd5c0b7..3e6df7b42b 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -223,6 +223,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ for (size_t idx = 0; idx < norm_prompt.images_sequence.size(); idx++) { norm_prompt.images_sequence[idx] -= m_image_id; + } + for (size_t idx = 0; idx < norm_prompt.videos_sequence.size(); idx++) { norm_prompt.videos_sequence[idx] -= m_video_id; } } From 02773925c32c8e3d62092066cf6621963a993ab5 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Thu, 9 Oct 2025 21:01:52 +0800 Subject: [PATCH 061/118] 1: Add a macro to disable the "If" node. 2: Pass mean/scale constants as typed Constant nodes.
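For intuition: the If node built here selects between two preprocessing branches. The image branch duplicates the single input frame to satisfy temporal_patch_size == 2; the video branch concatenates the two adjacent frames it was given. A pure-NumPy sketch of that selection logic (an illustration of the semantics only, not the actual graph):

    import numpy as np

    def make_temporal_pair(frames: np.ndarray) -> np.ndarray:
        # frames: uint8 [N, H, W, C]; returns the 2-frame stack the encoder consumes
        if frames.shape[0] == 1:
            # image branch: duplicate the single frame
            return np.concatenate([frames, frames], axis=0)
        # video branch: keep the two adjacent frames as-is
        return frames[:2]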
Signed-off-by: xiping.yan --- .../src/visual_language/qwen2vl/classes.cpp | 59 +++++++++++++++---- 1 file changed, 48 insertions(+), 11 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 7d6c066b29..4034866f2e 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -147,7 +147,7 @@ std::pair, std::shared_ptr> patch auto img_normalized_2 = create_normalization(img_resized_2, image_mean, image_scale); int64_t concat_axis = 0; - ov::NodeVector inputs_to_concat = {img_normalized_1, img_normalized_2}; + ov::OutputVector inputs_to_concat = {img_normalized_1->output(0), img_normalized_2->output(0)}; auto temporal_images = std::make_shared(inputs_to_concat, concat_axis); auto result_temperal_images = std::make_shared(temporal_images); @@ -157,12 +157,13 @@ std::pair, std::shared_ptr> patch "else_body"), result_temperal_images}; } - +#define WITH_IF_NODE 0 std::shared_ptr patch_preprocess_into_model(std::shared_ptr model_org, - const ov::Tensor& image_mean_tensor, - const ov::Tensor& image_scale_tensor) { + const ov::op::v0::Constant& image_mean_tensor, + const ov::op::v0::Constant& image_scale_tensor) { +#if WITH_IF_NODE auto same_image = std::make_shared(ov::element::f32, ov::Shape{1}); - +#endif auto raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); @@ -172,9 +173,10 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(ov::element::i64, ov::PartialShape{4}); auto reshape_shape2d = std::make_shared(ov::element::i64, ov::PartialShape{2}); +#if WITH_IF_NODE same_image->set_friendly_name("same_image"); same_image->output(0).get_tensor().set_names({"same_image"}); - +#endif raw_images_1->set_friendly_name("raw_images_1"); raw_images_1->output(0).get_tensor().set_names({"raw_images_1"}); raw_images_2->set_friendly_name("raw_images_2"); @@ -195,7 +197,8 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(image_mean_tensor); auto image_scale = std::make_shared(image_scale_tensor); - // If + // with If +#if WITH_IF_NODE auto then_raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto then_resize_target_shape = std::make_shared(ov::element::i64, ov::PartialShape{2}); auto then_tile_shape = std::make_shared(ov::element::i64, ov::PartialShape{4}); @@ -230,13 +233,33 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptrset_input(tile_shape->output(0), then_tile_shape, nullptr); auto temporal_images = if_op->set_output(model_then.second, model_else.second); - auto img_8d = create_transpose_patches(temporal_images.get_node_shared_ptr(), reshape_shape8d, std::make_shared(ov::element::i32, Shape{8}, std::vector{0, 2, 5, 3, 6, 1, 4, 7})); +#else + auto img_f32_nchw_1 = create_f32_nchw_input(raw_images_1); + auto img_resized_1 = create_bicubic_resize(img_f32_nchw_1, resize_shape); + auto img_normalized_1 = create_normalization(img_resized_1, image_mean, image_scale); + + auto img_f32_nchw_2 = create_f32_nchw_input(raw_images_2); + auto img_resized_2 = create_bicubic_resize(img_f32_nchw_2, resize_shape); + auto img_normalized_2 = create_normalization(img_resized_2, image_mean, image_scale); + + int64_t concat_axis = 0; + ov::OutputVector inputs_to_concat = {img_normalized_1->output(0), img_normalized_2->output(0)}; + auto temporal_images = std::make_shared(inputs_to_concat, concat_axis); + // 
auto temporal_images = std::make_shared(img_normalized_1, tile_shape); + + auto img_8d = + create_transpose_patches(temporal_images, + reshape_shape8d, + std::make_shared(ov::element::i32, + Shape{8}, + std::vector{0, 2, 5, 3, 6, 1, 4, 7})); +#endif auto img_4d = create_transpose_patches( img_8d, @@ -251,7 +274,7 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptrget_results(); - +#if WITH_IF_NODE return std::make_shared(results, ov::ParameterVector{same_image, raw_images_1, @@ -261,6 +284,16 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(results, + ov::ParameterVector{raw_images_1, + raw_images_2, + resize_shape, + tile_shape, + reshape_shape8d, + reshape_shape4d, + reshape_shape2d}); +#endif } } // namespace @@ -594,8 +627,8 @@ std::unique_ptr> create_vision_encoder_ire for (auto& v : a_image_scale) v = 1.0f / (v * 255.0f); - ov::Tensor image_mean(ov::element::f32, {1, a_image_mean.size(), 1, 1}, a_image_mean.data()); - ov::Tensor image_scale(ov::element::f32, {1, a_image_scale.size(), 1, 1}, a_image_scale.data()); + auto image_mean = ov::op::v0::Constant(ov::element::f32, ov::Shape{1, a_image_mean.size(), 1, 1}, a_image_mean.data()); + auto image_scale = ov::op::v0::Constant(ov::element::f32, ov::Shape{1, a_image_scale.size(), 1, 1}, a_image_scale.data()); auto model = patch_preprocess_into_model(model_org, image_mean, image_scale); auto compiled_model = utils::singleton_core().compile_model(model, device, config); @@ -727,8 +760,10 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); +#if WITH_IF_NODE std::vector same_image_data{images.size() == 2u ? 1.f : 0.f}; ov::Tensor same_image(ov::element::f32, ov::Shape{1}, same_image_data.data()); +#endif ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); ov::Tensor input_image_2(ov::element::u8, image_shape, @@ -767,7 +802,9 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape); // Same image means just duplicating input_image_1 as input_image_2 or not. +#if WITH_IF_NODE encoder.set_tensor("same_image", same_image); +#endif encoder.set_tensor("raw_images_1", input_image_1); encoder.set_tensor("raw_images_2", input_image_2); encoder.set_tensor("resize_shape", target_shape); From fe5e7093419dbfd570959fe2261c90bcac404627 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 10:28:04 +0800 Subject: [PATCH 062/118] pass video for sdpa backend. 
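With this change the chain runs end to end on the SDPA backend. A usage sketch (model_path and video_tensor as in the tests above):

    vlm = VLMPipeline(model_path, "CPU", ATTENTION_BACKEND="SDPA")
    out = vlm.generate("Describe this video.", videos=[video_tensor], max_new_tokens=20, do_sample=False)
    print(out.texts[0])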
Signed-off-by: xiping.yan --- src/cpp/src/visual_language/pipeline.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 3e6df7b42b..83a63a7f2f 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -236,9 +236,22 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto start_get_inputs_embeds = std::chrono::steady_clock::now(); if (m_inputs_embedder->has_token_type_ids()) { - std::tie(inputs_embeds, token_type_ids) = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, norm_prompt.images_sequence); + std::tie(inputs_embeds, token_type_ids) = + m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, + encoded_images, + encoded_videos, + perf_metrics, + encoded_images.size() > 0 || encoded_videos.size() > 0, + norm_prompt.images_sequence, + norm_prompt.videos_sequence); } else { - inputs_embeds = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, perf_metrics, encoded_images.size() > 0, norm_prompt.images_sequence); + inputs_embeds = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, + encoded_images, + encoded_videos, + perf_metrics, + encoded_images.size() > 0 || encoded_videos.size() > 0, + norm_prompt.images_sequence, + norm_prompt.videos_sequence); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); From 1ad75e6627fa9198a1a9ec889a9195cddded96cb Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 11:39:19 +0800 Subject: [PATCH 063/118] Comparing with Optimum, test pass. Signed-off-by: xiping.yan --- tests/python_tests/test_vlm_pipeline.py | 27 +++++++++---------------- 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index 48697399ff..ec245ca2b3 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -1167,7 +1167,6 @@ def get_nanollava_processor(): def test_vlm_pipeline_match_optimum_video_input(request, model_id, video_name, backend): video_ov_tensor = request.getfixturevalue(video_name) assert(isinstance(video_ov_tensor, openvino.Tensor)) - video_torch_tensor = torch.from_numpy(video_ov_tensor.data) prompt = "Describe this video." 
max_new_tokens = 20 @@ -1176,25 +1175,19 @@ def test_vlm_pipeline_match_optimum_video_input(request, model_id, video_name, b # Run the model with optimum-intel model = OVModelForVisualCausalLM.from_pretrained(model_path, trust_remote_code=True) - conversation = [ - { - "role": "user", - "content": [ - {"type": "video"}, - {"type": "text", "text": prompt}, - ], - } - ] - processor = transformers.AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - templated_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - inputs = processor(text=[templated_prompt], video=video_torch_tensor, padding=True, return_tensors="pt") - output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False) - input_ids = inputs["input_ids"] if isinstance(inputs, dict) else inputs.input_ids - generated_ids = [output_ids[len(input_ids) :] for input_ids, output_ids in zip(input_ids, output_ids)] + inputs = model.preprocess_inputs(text=prompt, video=video_ov_tensor.data, processor=processor) + generation_args = { + "max_new_tokens": max_new_tokens, + "temperature": 0.0, + "do_sample": False + } + + generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args) - optimum_output = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) + generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:] + optimum_output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) optimum_text = optimum_output[0] # Run the model with GenAI From 1cdee9dfc407c43d63885cea466e7afab7d54d73 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 14:28:49 +0800 Subject: [PATCH 064/118] update genai.pyi, rename video to videos. Signed-off-by: xiping.yan --- src/python/openvino_genai/py_openvino_genai.pyi | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index 60846fa120..ae041d5a43 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -429,7 +429,7 @@ class ContinuousBatchingPipeline: def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: + def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], videos: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... def finish_chat(self) -> None: ...
@@ -3510,8 +3510,8 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor - :param video: list of frames - :type video: list[ov.Tensor] + :param videos: list of frames + :type videos: list[ov.Tensor] :param generation_config: generation_config :type generation_config: GenerationConfig or a dict @@ -3526,7 +3526,7 @@ class VLMPipeline: :rtype: VLMDecodedResults """ @typing.overload - def generate(self, prompt: str, video: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: + def generate(self, prompt: str, videos: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig, streamer: collections.abc.Callable[[str], int | None] | openvino_genai.py_openvino_genai.StreamerBase | None = None, **kwargs) -> VLMDecodedResults: """ Generates sequences for VLMs. @@ -3552,8 +3552,8 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor - :param video: list of frames - :type video: list[ov.Tensor] + :param videos: list of frames + :type videos: list[ov.Tensor] :param generation_config: generation_config :type generation_config: GenerationConfig or a dict @@ -3596,8 +3596,8 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor - :param video: list of frames - :type video: list[ov.Tensor] + :param videos: list of frames + :type videos: list[ov.Tensor] :param generation_config: generation_config :type generation_config: GenerationConfig or a dict From 06c029e9212cfca64f467238484f9d620b07540b Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Fri, 10 Oct 2025 15:04:51 +0800 Subject: [PATCH 065/118] Update tests/python_tests/test_vlm_pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/python_tests/test_vlm_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index ec245ca2b3..aec521cf00 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -258,7 +258,7 @@ def streamer(word: str) -> bool: generation_config.max_new_tokens = 30 generation_config.set_eos_token_id(ov_pipe.get_tokenizer().get_eos_token_id()) - for images, videos in [[], [countdown_video]], [[cat_tensor], [countdown_video]]: + for images, videos in [([], [countdown_video]), ([cat_tensor], [countdown_video])]: result_from_streamer = [] res = ov_pipe.generate( prompts[0], From 584c54684b7017595560ce4c8304745748cc04a9 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 15:20:51 +0800 Subject: [PATCH 066/118] Fix ci issues Signed-off-by: xiping.yan --- .../src/visual_language/qwen2vl/classes.cpp | 45 +------------------ .../openvino_genai/py_openvino_genai.pyi | 2 + 2 files changed, 4 insertions(+), 43 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 4034866f2e..f4c9f24db7 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -157,13 +157,11 @@ std::pair, std::shared_ptr> patch "else_body"), result_temperal_images}; } -#define WITH_IF_NODE 0 + std::shared_ptr patch_preprocess_into_model(std::shared_ptr model_org, const ov::op::v0::Constant& 
image_mean_tensor, const ov::op::v0::Constant& image_scale_tensor) { -#if WITH_IF_NODE auto same_image = std::make_shared(ov::element::f32, ov::Shape{1}); -#endif auto raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); @@ -173,10 +171,8 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(ov::element::i64, ov::PartialShape{4}); auto reshape_shape2d = std::make_shared(ov::element::i64, ov::PartialShape{2}); -#if WITH_IF_NODE same_image->set_friendly_name("same_image"); same_image->output(0).get_tensor().set_names({"same_image"}); -#endif raw_images_1->set_friendly_name("raw_images_1"); raw_images_1->output(0).get_tensor().set_names({"raw_images_1"}); raw_images_2->set_friendly_name("raw_images_2"); @@ -197,8 +193,7 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(image_mean_tensor); auto image_scale = std::make_shared(image_scale_tensor); - // with If -#if WITH_IF_NODE + // If auto then_raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto then_resize_target_shape = std::make_shared(ov::element::i64, ov::PartialShape{2}); auto then_tile_shape = std::make_shared(ov::element::i64, ov::PartialShape{4}); @@ -239,27 +234,6 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(ov::element::i32, Shape{8}, std::vector{0, 2, 5, 3, 6, 1, 4, 7})); -#else - auto img_f32_nchw_1 = create_f32_nchw_input(raw_images_1); - auto img_resized_1 = create_bicubic_resize(img_f32_nchw_1, resize_shape); - auto img_normalized_1 = create_normalization(img_resized_1, image_mean, image_scale); - - auto img_f32_nchw_2 = create_f32_nchw_input(raw_images_2); - auto img_resized_2 = create_bicubic_resize(img_f32_nchw_2, resize_shape); - auto img_normalized_2 = create_normalization(img_resized_2, image_mean, image_scale); - - int64_t concat_axis = 0; - ov::OutputVector inputs_to_concat = {img_normalized_1->output(0), img_normalized_2->output(0)}; - auto temporal_images = std::make_shared(inputs_to_concat, concat_axis); - // auto temporal_images = std::make_shared(img_normalized_1, tile_shape); - - auto img_8d = - create_transpose_patches(temporal_images, - reshape_shape8d, - std::make_shared(ov::element::i32, - Shape{8}, - std::vector{0, 2, 5, 3, 6, 1, 4, 7})); -#endif auto img_4d = create_transpose_patches( img_8d, @@ -274,7 +248,6 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptrget_results(); -#if WITH_IF_NODE return std::make_shared(results, ov::ParameterVector{same_image, raw_images_1, @@ -284,16 +257,6 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(results, - ov::ParameterVector{raw_images_1, - raw_images_2, - resize_shape, - tile_shape, - reshape_shape8d, - reshape_shape4d, - reshape_shape2d}); -#endif } } // namespace @@ -760,10 +723,8 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); -#if WITH_IF_NODE std::vector same_image_data{images.size() == 2u ? 
1.f : 0.f}; ov::Tensor same_image(ov::element::f32, ov::Shape{1}, same_image_data.data()); -#endif ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); ov::Tensor input_image_2(ov::element::u8, image_shape, @@ -802,9 +763,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape); // Same image means just duplicating input_image_1 as input_image_2 or not. -#if WITH_IF_NODE encoder.set_tensor("same_image", same_image); -#endif encoder.set_tensor("raw_images_1", input_image_1); encoder.set_tensor("raw_images_2", input_image_2); encoder.set_tensor("resize_shape", target_shape); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index ae041d5a43..11cf310865 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -3540,6 +3540,8 @@ class VLMPipeline: InternVL2: \\n llava-1.5-7b-hf: LLaVA-NeXT: + nanoLLaVA: \\n + nanoLLaVA-1.5: \\n MiniCPM-V-2_6: (./)\\n Phi-3-vision: <|image_i|>\\n - the index starts with one Phi-4-multimodal-instruct: <|image_i|>\\n - the index starts with one From a77ce489dc12700a9982a516202f6f108bf76364 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 15:31:33 +0800 Subject: [PATCH 067/118] pass video for add request. Signed-off-by: xiping.yan --- src/cpp/src/continuous_batching/pipeline_base.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 43c9179fde..5d246cd058 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -305,6 +305,8 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( for (size_t idx = 0; idx < encoded_images.size(); idx++) { m_history_image_ids.pop_back(); m_history_images.pop_back(); + } + for (size_t idx = 0; idx < encoded_videos.size(); idx++) { m_history_video_ids.pop_back(); m_history_videos.pop_back(); } @@ -355,7 +357,7 @@ GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_re } auto norm_prompt = m_inputs_embedder->normalize_prompt(prompt, 0, 0, encoded_images, encoded_videos); - inputs = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, metrics, true, norm_prompt.images_sequence); + inputs = m_inputs_embedder->get_inputs_embeds(norm_prompt.unified_prompt, encoded_images, encoded_videos, metrics, true, norm_prompt.images_sequence, norm_prompt.videos_sequence); } return add_request(request_id, inputs, sampling_params); } From 890bc031458974f4e09f5d6210ee554d702943fd Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Fri, 10 Oct 2025 16:30:58 +0800 Subject: [PATCH 068/118] Update tests/python_tests/test_vlm_pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/python_tests/test_vlm_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index aec521cf00..e5d5b332ff 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -1166,7 +1166,7 @@ def get_nanollava_processor(): ) def test_vlm_pipeline_match_optimum_video_input(request, model_id, video_name, backend): video_ov_tensor = request.getfixturevalue(video_name) - 
assert(isinstance(video_ov_tensor, openvino.Tensor)) + assert isinstance(video_ov_tensor, openvino.Tensor) prompt = "Describe this video." max_new_tokens = 20 From 29e8b276a7316b29e2eac5eeb37639e2057c3f04 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Fri, 10 Oct 2025 21:15:22 +0800 Subject: [PATCH 069/118] Add docstring and some comments based on copilot's suggestion. Signed-off-by: xiping.yan --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 3 ++- tests/python_tests/test_vlm_pipeline.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index f4c9f24db7..7febbe2621 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -721,8 +721,9 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec config.max_pixels ); + // The default value of temporal_patch_size for original QWen2-VL and QWen2.5-VL is 2. + // Only 2 frames are processed at a time, so the following check is required. OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); - std::vector same_image_data{images.size() == 2u ? 1.f : 0.f}; ov::Tensor same_image(ov::element::f32, ov::Shape{1}, same_image_data.data()); ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index e5d5b332ff..d0ec920e85 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -157,7 +157,8 @@ def handwritten_tensor(pytestconfig): handwritten_url = "https://github.com/user-attachments/assets/8c9ae017-7837-4abc-ae92-c1054c9ec350" return openvino.Tensor(from_cache_or_download(pytestconfig, handwritten_url, "handwritten.png")) -# Return video with shape: [num_frames, height, width, 3] +# Creates a 5-frame countdown video with white text on black background for testing video preprocessing. +# Video shape: [num_frames, height, width, 3] def create_countdown_frames(): frames_count = 5 height = 240 From 02feed7be94114f3df2d6d88730339308906c8b0 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:14:39 +0800 Subject: [PATCH 070/118] Update tests/python_tests/test_vlm_pipeline.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/python_tests/test_vlm_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index d0ec920e85..e6018cfe4e 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ b/tests/python_tests/test_vlm_pipeline.py @@ -364,8 +364,9 @@ def test_vlm_continuous_batching_generate_vs_add_request_video_input(config, bac ) tokenizer = cb_pipe.get_tokenizer() + empty_images = [] for idx, videos in enumerate(video_links_list): - handle = cb_pipe.add_request(idx, prompts[0], [], videos, generation_config) + handle = cb_pipe.add_request(idx, prompts[0], empty_images, videos, generation_config) while handle.get_status() != GenerationStatus.FINISHED: cb_pipe.step() From 9e258696a4a2415b481c431ca91c5f0705fbf215 Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Sat, 11 Oct 2025 09:27:22 +0800 Subject: [PATCH 071/118] encode token separately, based on copilot's suggestion. 
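Background: the previous code tokenized image_pad_token + video_pad_token as one concatenated string and read the ids at positions -2 and -1, which silently breaks if the pair does not tokenize into exactly two ids. Encoding each token on its own and taking its last id avoids that assumption. The same idea with an HF tokenizer (a sketch; the literal pad-token strings are illustrative placeholders, not verified for this model):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    image_pad_token_id = tok.encode("<|image_pad|>", add_special_tokens=False)[-1]
    video_pad_token_id = tok.encode("<|video_pad|>", add_special_tokens=False)[-1]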
Signed-off-by: xiping.yan --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 7febbe2621..fc3a624e7d 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -560,10 +560,12 @@ ov::Tensor merge_text_and_image_embeddings( const float* vision_embeds_data = processed_vision_embeds.data(); size_t vision_embed_idx = 0; + const int64_t img_token = image_pad_token_id; + const int64_t vid_token = video_pad_token_id; for (size_t batch_idx = 0; batch_idx < batch_size; ++batch_idx) { for (size_t seq_idx = 0; seq_idx < seq_length; ++seq_idx) { size_t flat_idx = batch_idx * seq_length + seq_idx; - if (input_ids_data[flat_idx] == image_pad_token_id || input_ids_data[flat_idx] == video_pad_token_id) { + if (input_ids_data[flat_idx] == img_token || input_ids_data[flat_idx] == vid_token) { std::copy_n( vision_embeds_data + vision_embed_idx * hidden_size, hidden_size, @@ -976,13 +978,14 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p auto start_tokenizer_time = std::chrono::steady_clock::now(); ov::Tensor encoded_vision_start_token = m_tokenizer.encode(m_vlm_config.vision_start_token, ov::genai::add_special_tokens(false)).input_ids; - ov::Tensor encoded_image_pad_token = m_tokenizer.encode(m_vlm_config.image_pad_token + m_vlm_config.video_pad_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_image_pad_token = m_tokenizer.encode(m_vlm_config.image_pad_token, ov::genai::add_special_tokens(false)).input_ids; + ov::Tensor encoded_video_pad_token = m_tokenizer.encode(m_vlm_config.video_pad_token, ov::genai::add_special_tokens(false)).input_ids; auto end_tokenizer_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(metrics.raw_metrics.tokenization_durations.size() > 0); metrics.raw_metrics.tokenization_durations[metrics.raw_metrics.tokenization_durations.size() - 1] += ov::genai::MicroSeconds(PerfMetrics::get_microsec(end_tokenizer_time - start_tokenizer_time)); int64_t vision_start_token_id = encoded_vision_start_token.data()[encoded_vision_start_token.get_size() - 1]; - int64_t image_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 2]; - int64_t video_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 1]; + int64_t image_pad_token_id = encoded_image_pad_token.data()[encoded_image_pad_token.get_size() - 1]; + int64_t video_pad_token_id = encoded_video_pad_token.data()[encoded_video_pad_token.get_size() - 1]; m_position_ids = create_position_ids(input_ids, images_grid_thw, images_sequence, 0, video_grid_thw, videos_sequence, 0, vision_start_token_id); From a06d7f9492da5347d03353f4b983917ecee1874a Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:36:53 +0800 Subject: [PATCH 072/118] Update src/cpp/src/visual_language/vision_encoder.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/vision_encoder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index f930b0d19a..9ba25e1c33 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -58,7 +58,7 @@ struct EncodedImage { ResampledImage resampled_image; }; 
-struct NormlizedPrompt { +struct NormalizedPrompt { std::string unified_prompt; std::vector images_sequence; std::vector videos_sequence; From 654233f4734b32c35a1fbd70b53c6e5c2b5773c0 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:37:06 +0800 Subject: [PATCH 073/118] Update src/cpp/src/visual_language/vision_encoder.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/vision_encoder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 9ba25e1c33..5c029e340d 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -107,7 +107,7 @@ class VisionEncoder { /// its slices. virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; - /// @brief Compute embeddings of a or mulitple video given + /// @brief Compute embeddings of a or multiple video given virtual std::vector encode_frames(const std::vector& frames, const ov::AnyMap& config_map = {}) { OPENVINO_THROW("The current model does not support 'video' input, please use 'images' instead."); } From c8b4b2d80880c52216c8198bc719d2b312ce02e0 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:37:25 +0800 Subject: [PATCH 074/118] Update src/cpp/src/visual_language/qwen2vl/classes.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/qwen2vl/classes.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index fc3a624e7d..629340e243 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -879,7 +879,7 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( }); } -NormlizedPrompt InputsEmbedderQwen2VL::normalize_prompt( +NormalizedPrompt InputsEmbedderQwen2VL::normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, From 0596f579cd5e383d8f824e4f1e70c5980e76eafc Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:37:32 +0800 Subject: [PATCH 075/118] Update src/cpp/src/visual_language/inputs_embedder.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/inputs_embedder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 3e8ba541ee..6d7d1bd5bf 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -92,7 +92,7 @@ class InputsEmbedder { const std::vector& images ) const; - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, From 52e4971e12d6eebc764e3bd4294017f4443f7e71 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:37:48 +0800 Subject: [PATCH 076/118] Update src/cpp/src/visual_language/inputs_embedder.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/inputs_embedder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index b9f1583fb7..23e8ddef28 100644 --- 
a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -393,7 +393,7 @@ std::pair> InputsEmbedder::normalize_prompt( return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } -NormlizedPrompt InputsEmbedder::normalize_prompt( +NormalizedPrompt InputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, From 1ff0b7c3c70ec7394a9d7a286a258ade1cfbc1c0 Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:38:29 +0800 Subject: [PATCH 077/118] Update src/cpp/src/visual_language/inputs_embedder.hpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/cpp/src/visual_language/inputs_embedder.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 6d7d1bd5bf..6dc325a37f 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -182,7 +182,7 @@ class InputsEmbedder { const std::vector& images ) const = 0; - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, From 243c4f8e7e21336920b7448de22ee401720bb8ad Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Sat, 11 Oct 2025 09:44:44 +0800 Subject: [PATCH 078/118] Rename NormlizedPrompt to NormalizedPrompt --- src/cpp/src/visual_language/inputs_embedder.cpp | 4 ++-- src/cpp/src/visual_language/qwen2vl/classes.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index 23e8ddef28..20a19f5bac 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -193,7 +193,7 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_vid OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. Please use 'images' instead."); } -NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( +NormalizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, @@ -202,7 +202,7 @@ NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( if (videos.size() > 0) { OPENVINO_THROW("The model doesn't support 'videos' preprocessing yet. 
Please use 'images' instead."); } else { - NormlizedPrompt norm_prompt; + NormalizedPrompt norm_prompt; std::tie(norm_prompt.unified_prompt, norm_prompt.images_sequence) = normalize_prompt(prompt, base_id, images); return norm_prompt; } diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 7a74d4aee0..52adb69248 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -67,7 +67,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, size_t video_base_id, From 4b98644a81bb8e4d5585d5af194fb456dca5072e Mon Sep 17 00:00:00 2001 From: Xiping Yan Date: Sat, 11 Oct 2025 09:51:17 +0800 Subject: [PATCH 079/118] Update src/python/py_continuous_batching_pipeline.cpp Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/python/py_continuous_batching_pipeline.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index 56dc6990e8..098d0c7938 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -497,7 +497,7 @@ void init_continuous_batching_pipeline(py::module_& m) { const ov::genai::GenerationConfig& generation_config, const pyutils::PyBindStreamerVariant& streamer) -> py::typing::Union> { - std::vector prompts = { prompts }; + std::vector prompts = { prompt }; std::vector generation_configs = { generation_config }; return __call_cb_generate(pipe, prompts, generation_configs, streamer); }, From 0885a63527fac4573b7045baa4d31019b57a7f6b Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Sat, 11 Oct 2025 10:41:30 +0800 Subject: [PATCH 080/118] 1: if condition node "same_image" is confuse, just rename to cond_img_vid, 2: add some docstring. Signed-off-by: xiping.yan --- .../src/visual_language/qwen2vl/classes.cpp | 41 +++++++++++-------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index 629340e243..665390b5f3 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -132,7 +132,7 @@ std::pair, std::shared_ptr> patch } std::pair, std::shared_ptr> patch_preprocess_branch_video( - std::shared_ptr same_image, + std::shared_ptr cond_img_vid, std::shared_ptr raw_images_1, std::shared_ptr raw_images_2, std::shared_ptr resize_shape, @@ -151,9 +151,11 @@ std::pair, std::shared_ptr> patch auto temporal_images = std::make_shared(inputs_to_concat, concat_axis); auto result_temperal_images = std::make_shared(temporal_images); - auto result_ignore = std::make_shared(same_image); + + // If node's limitation: condition node must be output. 
+ auto result_ignore = std::make_shared(cond_img_vid); return {std::make_shared(ov::ResultVector{result_temperal_images, result_ignore}, - ov::ParameterVector{same_image, raw_images_1, raw_images_2, resize_shape}, + ov::ParameterVector{cond_img_vid, raw_images_1, raw_images_2, resize_shape}, "else_body"), result_temperal_images}; } @@ -161,7 +163,7 @@ std::pair, std::shared_ptr> patch std::shared_ptr patch_preprocess_into_model(std::shared_ptr model_org, const ov::op::v0::Constant& image_mean_tensor, const ov::op::v0::Constant& image_scale_tensor) { - auto same_image = std::make_shared(ov::element::f32, ov::Shape{1}); + auto cond_img_vid = std::make_shared(ov::element::f32, ov::Shape{1}); auto raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); @@ -171,8 +173,8 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(ov::element::i64, ov::PartialShape{4}); auto reshape_shape2d = std::make_shared(ov::element::i64, ov::PartialShape{2}); - same_image->set_friendly_name("same_image"); - same_image->output(0).get_tensor().set_names({"same_image"}); + cond_img_vid->set_friendly_name("cond_img_vid"); + cond_img_vid->output(0).get_tensor().set_names({"cond_img_vid"}); raw_images_1->set_friendly_name("raw_images_1"); raw_images_1->output(0).get_tensor().set_names({"raw_images_1"}); raw_images_2->set_friendly_name("raw_images_2"); @@ -203,11 +205,11 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(ov::element::f32, ov::Shape{1}); + auto else_video = std::make_shared(ov::element::f32, ov::Shape{1}); auto else_raw_images_1 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto else_raw_images_2 = std::make_shared(ov::element::u8, ov::PartialShape{-1, -1, -1, -1}); auto else_resize_target_shape = std::make_shared(ov::element::i64, ov::PartialShape{2}); - auto model_else = patch_preprocess_branch_video(else_same_image, + auto model_else = patch_preprocess_branch_video(else_video, else_raw_images_1, else_raw_images_2, else_resize_target_shape, @@ -217,7 +219,7 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptr(); if_op->set_then_body(model_then.first); if_op->set_else_body(model_else.first); - if_op->set_input(same_image->output(0), nullptr, else_same_image); + if_op->set_input(cond_img_vid->output(0), nullptr, else_video); if_op->set_input(raw_images_1->output(0), nullptr, else_raw_images_1); if_op->set_input(raw_images_2->output(0), nullptr, else_raw_images_2); @@ -243,13 +245,13 @@ std::shared_ptr patch_preprocess_into_model(std::shared_ptrget_parameters(); - OPENVINO_ASSERT(params_org.size() == 1); + OPENVINO_ASSERT(params_org.size() == 1u); ov::replace_node(params_org[0], img_2d); auto results = model_org->get_results(); return std::make_shared(results, - ov::ParameterVector{same_image, + ov::ParameterVector{cond_img_vid, raw_images_1, raw_images_2, resize_shape, @@ -640,7 +642,11 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_cpp(const std::ve CircularBufferQueueElementGuard infer_request_guard(this->m_ireq_queue_vision_encoder.get()); ov::InferRequest& encoder = infer_request_guard.get(); ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); - + + // The default value of temporal_patch_size for original QWen2-VL and QWen2.5-VL is 2. + // If images.size() == 1: means processing image. + // If images.size() == 2: means processing video. 
+ // If images.size() is any other value: undefined behaviour, so the following check is required. OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); if (images.size() > 1) OPENVINO_ASSERT(config.temporal_patch_size == images.size(), "temporal_patch_size != images.size()"); @@ -724,10 +730,12 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec ); // The default value of temporal_patch_size for original QWen2-VL and QWen2.5-VL is 2. - // Only 2 frames are processed at a time, so the following check is required. + // In this model, only 2 frames are processed at a time, so the following check is required. + // If cond_img_vid == 1: image branch, input_image_1 is duplicated as input_image_2. + // If cond_img_vid == 0: video branch, adjacent frames are processed. OPENVINO_ASSERT(config.temporal_patch_size == 2u, "temporal_patch_size != 2."); - std::vector same_image_data{images.size() == 2u ? 1.f : 0.f}; - ov::Tensor same_image(ov::element::f32, ov::Shape{1}, same_image_data.data()); + std::vector cond_img_vid_data{images.size() == 2u ? 0.f : 1.f}; + ov::Tensor cond_img_vid(ov::element::f32, ov::Shape{1}, cond_img_vid_data.data()); ov::Tensor input_image_1(ov::element::u8, image_shape, images[0].data()); ov::Tensor input_image_2(ov::element::u8, image_shape, @@ -765,8 +773,7 @@ EncodedImage VisionEncoderQwen2VL::encode_with_imagepreprocess_ov(const std::vec ov::Tensor reshape_shape4d(ov::element::i64, ov::Shape{4}, a_temp_shape4d); ov::Tensor reshape_shape2d(ov::element::i64, ov::Shape{2}, last_output_shape); - // Same image means just duplicating input_image_1 as input_image_2 or not. - encoder.set_tensor("same_image", same_image); + encoder.set_tensor("cond_img_vid", cond_img_vid); encoder.set_tensor("raw_images_1", input_image_1); encoder.set_tensor("raw_images_2", input_image_2); encoder.set_tensor("resize_shape", target_shape); From 95c208b026545cdf15adf762af9e0252aa5b341d Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Sat, 11 Oct 2025 16:45:30 +0800 Subject: [PATCH 081/118] Fix bugs after merging master.
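After this merge fix, images and videos flow through one unified NormalizedPrompt path, so a single request can carry both modalities. A sketch of the Python call shape (names as in the tests in this series; either list may be empty):

    handle = cb_pipe.add_request(0, prompts[0], [cat_tensor], [countdown_video], generation_config)
    while handle.get_status() != GenerationStatus.FINISHED:
        cb_pipe.step()
    outputs = handle.read_all()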
Signed-off-by: xiping.yan --- .../src/continuous_batching/pipeline_base.cpp | 31 +++-------- .../src/visual_language/gemma3/classes.cpp | 2 +- .../src/visual_language/gemma3/classes.hpp | 2 +- .../src/visual_language/inputs_embedder.cpp | 25 ++------- .../src/visual_language/inputs_embedder.hpp | 10 ++-- .../visual_language/internvl_chat/classes.cpp | 2 +- .../visual_language/internvl_chat/classes.hpp | 2 +- src/cpp/src/visual_language/llava/classes.cpp | 2 +- src/cpp/src/visual_language/llava/classes.hpp | 2 +- .../visual_language/llava_next/classes.cpp | 2 +- .../visual_language/llava_next/classes.hpp | 2 +- .../llava_next_video/classes.cpp | 4 +- .../llava_next_video/classes.hpp | 4 +- .../src/visual_language/minicpm/classes.cpp | 2 +- .../src/visual_language/minicpm/classes.hpp | 2 +- .../src/visual_language/nanollava/classes.cpp | 2 +- .../src/visual_language/nanollava/classes.hpp | 2 +- .../visual_language/phi3_vision/classes.cpp | 2 +- .../visual_language/phi3_vision/classes.hpp | 2 +- .../src/visual_language/phi4mm/classes.cpp | 2 +- .../src/visual_language/phi4mm/classes.hpp | 2 +- src/cpp/src/visual_language/pipeline.cpp | 19 +++---- .../visual_language/qwen2_5_vl/classes.cpp | 2 +- .../visual_language/qwen2_5_vl/classes.hpp | 2 +- .../src/visual_language/qwen2vl/classes.cpp | 46 ++++++++-------- .../src/visual_language/qwen2vl/classes.hpp | 14 ++--- .../src/visual_language/vision_encoder.hpp | 9 +-- .../py_continuous_batching_pipeline.cpp | 55 +++++++------------ tests/python_tests/test_vlm_pipeline.py | 2 +- 29 files changed, 103 insertions(+), 152 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index bf4db889e2..40f7d454fe 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -165,21 +165,10 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( const std::vector>& videos_vector, const std::vector& sampling_params, const StreamerVariant& streamer) { - return generate(prompts, rgbs_vector, {}, sampling_params, streamer); -} - -std::vector -ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( - const std::vector& prompts, - const std::vector>& rgbs_vector, - const std::vector>& video_vector, - const std::vector& sampling_params, - const StreamerVariant& streamer) { auto generate_start_time = std::chrono::steady_clock::now(); OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS); OPENVINO_ASSERT(prompts.size() == sampling_params.size(), "Number of prompts should be equal to the number of generation configs."); - OPENVINO_ASSERT(prompts.size() == images_vector.size() && prompts.size() == videos_vector.size(), "Number of prompts should be equal to the number of images or video vectors."); std::vector input_embeds_list; std::vector token_type_ids_list; @@ -247,14 +236,13 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( m_inputs_embedder->set_apply_chat_template_status(sampling_params[i].apply_chat_template); if (m_inputs_embedder->has_token_type_ids()) { - auto [embeds, tt_ids] = - m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, - encoded_images, - encoded_videos, - vlm_perf_metrics[i], - true, - norm_prompt.images_sequence, - norm_prompt.videos_sequence); + auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(unified_prompt, + encoded_images, + encoded_videos, + vlm_perf_metrics[i], + true, + image_sequence, + 
video_sequence); input_embeds_list.push_back(std::move(embeds)); token_type_ids_list.push_back(std::move(tt_ids)); } else { @@ -322,7 +310,6 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = m_inputs_embedder->encode_images(rgbs); const auto [unified_prompt, image_sequence, video_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); @@ -331,14 +318,14 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re return add_request(request_id, inputs, sampling_params); } -GenerationHandle ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request( +GenerationHandle +ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request( uint64_t request_id, const std::string& prompt, const std::vector& images, const std::vector& videos, GenerationConfig sampling_params) { OPENVINO_ASSERT(m_model_input_type == ModelInputType::EMBEDDINGS, "Model doesn't support embeddings."); - ov::genai::VLMPerfMetrics metrics; ov::Tensor inputs; { diff --git a/src/cpp/src/visual_language/gemma3/classes.cpp b/src/cpp/src/visual_language/gemma3/classes.cpp index 8da0ad21c3..c458fe70e2 100644 --- a/src/cpp/src/visual_language/gemma3/classes.cpp +++ b/src/cpp/src/visual_language/gemma3/classes.cpp @@ -85,7 +85,7 @@ std::vector InputsEmbedderGemma3::encode_images(const s return embeds; } -NormlizedPrompt InputsEmbedderGemma3::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderGemma3::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { std::string start_of_image = m_vlm_config.start_of_image; std::string image_token = m_vlm_config.image_soft_token; std::string end_of_image = m_vlm_config.end_of_image; diff --git a/src/cpp/src/visual_language/gemma3/classes.hpp b/src/cpp/src/visual_language/gemma3/classes.hpp index d8e44327c1..9ddee34b9c 100644 --- a/src/cpp/src/visual_language/gemma3/classes.hpp +++ b/src/cpp/src/visual_language/gemma3/classes.hpp @@ -43,7 +43,7 @@ class InputsEmbedderGemma3 : public InputsEmbedder::IInputsEmbedder { std::vector encode_images(const std::vector& images) override; - NormlizedPrompt normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const override; + NormalizedPrompt normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const override; std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) override; diff --git a/src/cpp/src/visual_language/inputs_embedder.cpp b/src/cpp/src/visual_language/inputs_embedder.cpp index c6215193f5..75f8e23376 100644 --- a/src/cpp/src/visual_language/inputs_embedder.cpp +++ b/src/cpp/src/visual_language/inputs_embedder.cpp @@ -197,7 +197,7 @@ std::vector InputsEmbedder::IInputsEmbedder::encode_vid OPENVINO_THROW("Current model doesn't support video preprocess currently. 
Input images are processed as separate images."); } -NormlizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( +NormalizedPrompt InputsEmbedder::IInputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_image_id, size_t base_video_id, @@ -353,8 +353,8 @@ std::vector InputsEmbedder::encode_images(const std::ve return m_impl->encode_images(images); } -std::vector InputsEmbedder::encode_video(const std::vector& videos) { - return m_impl->encode_video(videos); +std::vector InputsEmbedder::encode_videos(const std::vector& videos) { + return m_impl->encode_videos(videos); } std::pair> InputsEmbedder::get_position_ids(const size_t inputs_embeds_size, const size_t history_size) { @@ -389,7 +389,7 @@ void InputsEmbedder::finish_chat() { return m_impl->finish_chat(); } -NormlizedPrompt InputsEmbedder::normalize_prompt( +NormalizedPrompt InputsEmbedder::normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images @@ -398,18 +398,7 @@ NormlizedPrompt InputsEmbedder::normalize_prompt( return {norm_prompt.unified_prompt, norm_prompt.images_sequence}; } -NormalizedPrompt InputsEmbedder::normalize_prompt( - const std::string& prompt, - size_t base_id, - size_t video_base_id, - const std::vector& images, - const std::vector>& videos -) const { - return m_impl->normalize_prompt(prompt, base_id, video_base_id, images, videos); -} - - -NormlizedPrompt InputsEmbedder::normalize_prompt(const std::string& prompt, +NormalizedPrompt InputsEmbedder::normalize_prompt(const std::string& prompt, size_t base_image_id, size_t base_video_id, const std::vector& images, @@ -418,10 +407,6 @@ NormlizedPrompt InputsEmbedder::normalize_prompt(const std::string& prompt, return m_impl->normalize_prompt(prompt, base_image_id, base_video_id, images, videos); } -std::vector InputsEmbedder::encode_videos(const std::vector& videos) { - return m_impl->encode_videos(videos); -} - void verify_ids(const std::vector& image_ids, size_t base_id, size_t n_images) { for (size_t idx : image_ids) { OPENVINO_ASSERT(base_id <= idx, "Referring to older images isn't implemented"); diff --git a/src/cpp/src/visual_language/inputs_embedder.hpp b/src/cpp/src/visual_language/inputs_embedder.hpp index 99e444de73..fd8a50913c 100644 --- a/src/cpp/src/visual_language/inputs_embedder.hpp +++ b/src/cpp/src/visual_language/inputs_embedder.hpp @@ -22,7 +22,7 @@ namespace ov::genai { struct VLMPerfMetrics; const static std::regex UNIVERSAL_PATTERN{R"()"}; -struct NormlizedPrompt { +struct NormalizedPrompt { std::string unified_prompt; std::vector images_sequence; std::vector videos_sequence; @@ -93,13 +93,13 @@ class InputsEmbedder { // finishes chat and clears a chat history void finish_chat(); - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images ) const; - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_image_id, size_t base_video_id, @@ -193,13 +193,13 @@ class InputsEmbedder { virtual void finish_chat(); - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images ) const = 0; - virtual NormlizedPrompt normalize_prompt( + virtual NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_image_id, size_t base_video_id, diff --git a/src/cpp/src/visual_language/internvl_chat/classes.cpp 
b/src/cpp/src/visual_language/internvl_chat/classes.cpp index f4c3ce7ae4..127affc11e 100644 --- a/src/cpp/src/visual_language/internvl_chat/classes.cpp +++ b/src/cpp/src/visual_language/internvl_chat/classes.cpp @@ -229,7 +229,7 @@ InputsEmbedderInternVLChat::InputsEmbedderInternVLChat( IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) { } -NormlizedPrompt InputsEmbedderInternVLChat::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderInternVLChat::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG + '\n', base_id, images.size()); std::string image_start_token = m_vlm_config.image_start_token; diff --git a/src/cpp/src/visual_language/internvl_chat/classes.hpp b/src/cpp/src/visual_language/internvl_chat/classes.hpp index b2fadd3304..ffa10a8151 100644 --- a/src/cpp/src/visual_language/internvl_chat/classes.hpp +++ b/src/cpp/src/visual_language/internvl_chat/classes.hpp @@ -37,7 +37,7 @@ class InputsEmbedderInternVLChat : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/llava/classes.cpp b/src/cpp/src/visual_language/llava/classes.cpp index 6848b64770..a4de9c5b4b 100644 --- a/src/cpp/src/visual_language/llava/classes.cpp +++ b/src/cpp/src/visual_language/llava/classes.cpp @@ -86,7 +86,7 @@ std::vector InputsEmbedderLLaVA::encode_images(const st return embeds; } -NormlizedPrompt InputsEmbedderLLaVA::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderLLaVA::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { std::string image_token = m_vlm_config.im_start; auto [unified_prompt, images_sequence] = normalize(prompt, image_token, image_token, base_id, images.size()); diff --git a/src/cpp/src/visual_language/llava/classes.hpp b/src/cpp/src/visual_language/llava/classes.hpp index 46afd9b75d..d6a1a8450e 100644 --- a/src/cpp/src/visual_language/llava/classes.hpp +++ b/src/cpp/src/visual_language/llava/classes.hpp @@ -39,7 +39,7 @@ class InputsEmbedderLLaVA : public InputsEmbedder::IInputsEmbedder { std::vector encode_images(const std::vector& images) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/llava_next/classes.cpp b/src/cpp/src/visual_language/llava_next/classes.cpp index 07019640fe..4068134c1b 100644 --- a/src/cpp/src/visual_language/llava_next/classes.cpp +++ b/src/cpp/src/visual_language/llava_next/classes.cpp @@ -340,7 +340,7 @@ std::vector InputsEmbedderLLaVANext::encode_images(cons return embeds; } -NormlizedPrompt InputsEmbedderLLaVANext::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderLLaVANext::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { std::string image_token = 
m_vlm_config.im_start; auto [unified_prompt, images_sequence] = normalize(prompt, image_token, image_token, base_id, images.size()); std::vector image_embeds; diff --git a/src/cpp/src/visual_language/llava_next/classes.hpp b/src/cpp/src/visual_language/llava_next/classes.hpp index a1734c3d74..a3f87c8442 100644 --- a/src/cpp/src/visual_language/llava_next/classes.hpp +++ b/src/cpp/src/visual_language/llava_next/classes.hpp @@ -30,7 +30,7 @@ class InputsEmbedderLLaVANext : public InputsEmbedderLLaVA { std::vector encode_images(const std::vector& images) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/llava_next_video/classes.cpp b/src/cpp/src/visual_language/llava_next_video/classes.cpp index 0d2eb22dd2..66220991a4 100644 --- a/src/cpp/src/visual_language/llava_next_video/classes.cpp +++ b/src/cpp/src/visual_language/llava_next_video/classes.cpp @@ -158,7 +158,7 @@ EncodedImage VisionEncoderLLaVANextVideo::encode(const ov::Tensor& image, const } -NormlizedPrompt InputsEmbedderLLaVANextVideo::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderLLaVANextVideo::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { std::string image_token = m_vlm_config.im_start; auto [unified_prompt, images_sequence] = normalize(prompt, image_token, image_token, base_id, images.size()); size_t searched_pos = 0; @@ -324,7 +324,7 @@ std::vector InputsEmbedderLLaVANextVideo::encode_videos return encoded_videos; } -NormlizedPrompt InputsEmbedderLLaVANextVideo::normalize_prompt(const std::string& prompt, +NormalizedPrompt InputsEmbedderLLaVANextVideo::normalize_prompt(const std::string& prompt, size_t base_image_id, size_t base_video_id, const std::vector& images, diff --git a/src/cpp/src/visual_language/llava_next_video/classes.hpp b/src/cpp/src/visual_language/llava_next_video/classes.hpp index 26c4e41482..9f57ba6537 100644 --- a/src/cpp/src/visual_language/llava_next_video/classes.hpp +++ b/src/cpp/src/visual_language/llava_next_video/classes.hpp @@ -69,7 +69,7 @@ class InputsEmbedderLLaVANextVideo : public InputsEmbedderLLaVANext { std::vector encode_videos(const std::vector& videos) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_image_id, size_t base_video_id, @@ -77,7 +77,7 @@ class InputsEmbedderLLaVANextVideo : public InputsEmbedderLLaVANext { const std::vector& videos) const override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images) const override; diff --git a/src/cpp/src/visual_language/minicpm/classes.cpp b/src/cpp/src/visual_language/minicpm/classes.cpp index 64dbdb356a..dea85e43a3 100644 --- a/src/cpp/src/visual_language/minicpm/classes.cpp +++ b/src/cpp/src/visual_language/minicpm/classes.cpp @@ -591,7 +591,7 @@ void adjust_pos_cache( } // namespace -NormlizedPrompt InputsEmbedderMiniCPM::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderMiniCPM::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { auto [unified_prompt, image_sequence] = normalize( prompt, diff --git a/src/cpp/src/visual_language/minicpm/classes.hpp 
b/src/cpp/src/visual_language/minicpm/classes.hpp index 4d0f255ba4..9b36a05c93 100644 --- a/src/cpp/src/visual_language/minicpm/classes.hpp +++ b/src/cpp/src/visual_language/minicpm/classes.hpp @@ -61,7 +61,7 @@ class InputsEmbedderMiniCPM : public InputsEmbedder::IInputsEmbedder { ov::Tensor get_inputs_embeds(const std::string& prompt, const std::vector& images, ov::genai::VLMPerfMetrics& metrics, bool recalculate_merged_embeddings = true, const std::vector& image_sequence = {}) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/nanollava/classes.cpp b/src/cpp/src/visual_language/nanollava/classes.cpp index d51831b2c6..7ebad167b8 100644 --- a/src/cpp/src/visual_language/nanollava/classes.cpp +++ b/src/cpp/src/visual_language/nanollava/classes.cpp @@ -138,7 +138,7 @@ std::vector InputsEmbedderNanoLLaVA::encode_images(cons return embeds; } -NormlizedPrompt InputsEmbedderNanoLLaVA::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderNanoLLaVA::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { auto norm_res = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_id, images.size()); return {norm_res.first, norm_res.second, {}}; } diff --git a/src/cpp/src/visual_language/nanollava/classes.hpp b/src/cpp/src/visual_language/nanollava/classes.hpp index 0c63e63aac..a09a7e19e9 100644 --- a/src/cpp/src/visual_language/nanollava/classes.hpp +++ b/src/cpp/src/visual_language/nanollava/classes.hpp @@ -39,7 +39,7 @@ class InputsEmbedderNanoLLaVA : public InputsEmbedder::IInputsEmbedder { std::vector encode_images(const std::vector& images) override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/phi3_vision/classes.cpp b/src/cpp/src/visual_language/phi3_vision/classes.cpp index 54ee608e06..e613042bc0 100644 --- a/src/cpp/src/visual_language/phi3_vision/classes.cpp +++ b/src/cpp/src/visual_language/phi3_vision/classes.cpp @@ -668,7 +668,7 @@ InputsEmbedderPhi3V::InputsEmbedderPhi3V( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} -NormlizedPrompt InputsEmbedderPhi3V::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderPhi3V::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { return {phi_utils::normalize_prompt(prompt, base_id, images.size(), NATIVE_PATTERN, write_native), {}}; } diff --git a/src/cpp/src/visual_language/phi3_vision/classes.hpp b/src/cpp/src/visual_language/phi3_vision/classes.hpp index 44ad6da016..3383fbde41 100644 --- a/src/cpp/src/visual_language/phi3_vision/classes.hpp +++ b/src/cpp/src/visual_language/phi3_vision/classes.hpp @@ -67,7 +67,7 @@ class InputsEmbedderPhi3V : public InputsEmbedder::IInputsEmbedder { void finish_chat() override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/phi4mm/classes.cpp b/src/cpp/src/visual_language/phi4mm/classes.cpp index 45cba35776..be568a0740 100644 --- a/src/cpp/src/visual_language/phi4mm/classes.cpp +++ 
b/src/cpp/src/visual_language/phi4mm/classes.cpp @@ -776,7 +776,7 @@ InputsEmbedderPhi4MM::InputsEmbedderPhi4MM( const ov::AnyMap device_config) : IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {} -NormlizedPrompt InputsEmbedderPhi4MM::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { +NormalizedPrompt InputsEmbedderPhi4MM::normalize_prompt(const std::string& prompt, size_t base_id, const std::vector& images) const { return {phi_utils::normalize_prompt(prompt, base_id, images.size(), NATIVE_PATTERN, write_native), {}, {}}; } diff --git a/src/cpp/src/visual_language/phi4mm/classes.hpp b/src/cpp/src/visual_language/phi4mm/classes.hpp index d1d07e22ac..b7dc551afd 100644 --- a/src/cpp/src/visual_language/phi4mm/classes.hpp +++ b/src/cpp/src/visual_language/phi4mm/classes.hpp @@ -66,7 +66,7 @@ class InputsEmbedderPhi4MM : public InputsEmbedder::IInputsEmbedder { void finish_chat() override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images diff --git a/src/cpp/src/visual_language/pipeline.cpp b/src/cpp/src/visual_language/pipeline.cpp index 62e93df9d4..720036d4a6 100644 --- a/src/cpp/src/visual_language/pipeline.cpp +++ b/src/cpp/src/visual_language/pipeline.cpp @@ -213,14 +213,11 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto [unified_prompt, image_sequence, video_sequence] = m_inputs_embedder->normalize_prompt(prompt, m_image_id, m_video_id, encoded_images, encoded_videos); if (m_is_chat_conversation) { - m_history.push_back({{"role", "user"}, {"content", norm_prompt.unified_prompt}}); - norm_prompt.unified_prompt = m_tokenizer.apply_chat_template(m_history, true); + m_history.push_back({{"role", "user"}, {"content", unified_prompt}}); + unified_prompt = m_tokenizer.apply_chat_template(m_history, true); - for (size_t idx = 0; idx < norm_prompt.images_sequence.size(); idx++) { - norm_prompt.images_sequence[idx] -= m_image_id; - } - for (size_t idx = 0; idx < norm_prompt.videos_sequence.size(); idx++) { - norm_prompt.videos_sequence[idx] -= m_video_id; + for (size_t idx = 0; idx < image_sequence.size(); idx++) { + image_sequence[idx] -= m_image_id; } for (size_t idx = 0; idx < video_sequence.size(); idx++) { video_sequence[idx] -= m_video_id; @@ -235,15 +232,15 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{ auto start_get_inputs_embeds = std::chrono::steady_clock::now(); if (m_inputs_embedder->has_token_type_ids()) { std::tie(inputs_embeds, token_type_ids) = - m_inputs_embedder->get_inputs_embeds_with_token_type_ids(norm_prompt.unified_prompt, + m_inputs_embedder->get_inputs_embeds_with_token_type_ids(unified_prompt, encoded_images, encoded_videos, perf_metrics, encoded_images.size() > 0 || encoded_videos.size() > 0, - norm_prompt.images_sequence, - norm_prompt.videos_sequence); + image_sequence, + video_sequence); } else { - inputs_embeds = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, encoded_videos, perf_metrics, encoded_images.size() > 0, image_sequence, video_sequence); + inputs_embeds = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, encoded_videos, perf_metrics, encoded_images.size() > 0 || encoded_videos.size() > 0, image_sequence, video_sequence); } auto end_get_inputs_embeds = std::chrono::steady_clock::now(); diff --git a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp 
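A note on the pipeline.cpp hunks above: in chat mode the templated history refers to media by conversation-global indices, while the embedder expects indices local to the current request, so every sequence entry is rebased by the running m_image_id / m_video_id counters. A standalone sketch of that rebasing (illustrative helper, not the actual pipeline code):

    #include <cstddef>
    #include <vector>

    // Shift conversation-global media indices down to request-local ones,
    // mirroring the subtraction loops in the hunks above.
    std::vector<size_t> rebase_sequence(std::vector<size_t> sequence, size_t base_id) {
        for (size_t idx = 0; idx < sequence.size(); idx++) {
            sequence[idx] -= base_id;  // e.g. image_sequence[idx] -= m_image_id
        }
        return sequence;
    }

    // With m_image_id == 2, a prompt referring to images {2, 3} maps to {0, 1}.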
index 78b30322af..c069c983a8 100644 --- a/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2_5_vl/classes.cpp @@ -110,7 +110,7 @@ InputsEmbedderQwen2_5_VL::InputsEmbedderQwen2_5_VL( ov::Tensor InputsEmbedderQwen2_5_VL::run_image_embeddings_merger( const std::vector& images, const std::vector& images_sequence, - const std::vector>& videos, + const std::vector& videos, const std::vector& videos_sequence ) { auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_video_embeds_and_grid_thw(images, images_sequence, videos, videos_sequence); diff --git a/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp b/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp index 081ac1cb82..f044d12bc8 100644 --- a/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2_5_vl/classes.hpp @@ -38,7 +38,7 @@ class InputsEmbedderQwen2_5_VL : public InputsEmbedderQwen2VL { ov::Tensor run_image_embeddings_merger( const std::vector& images, const std::vector& images_sequence, - const std::vector>& videos, + const std::vector& videos, const std::vector& videos_sequence) override; }; diff --git a/src/cpp/src/visual_language/qwen2vl/classes.cpp b/src/cpp/src/visual_language/qwen2vl/classes.cpp index d11e8bf9c1..215323a112 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.cpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.cpp @@ -419,14 +419,14 @@ ov::Tensor transpose_image_patches(const ov::Tensor& reshaped_patches) { std::pair, std::vector>> reorder_image_video_embeds_and_grid_thw( const std::vector& encoded_images, const std::vector& images_sequence, - const std::vector>& videos, + const std::vector& videos, const std::vector& videos_sequence ) { std::vector image_embeds; std::vector> images_grid_thw; size_t video_frames_sz = 0; for (const auto& encoded_video : videos) { - video_frames_sz += encoded_video.size(); + video_frames_sz += encoded_video.video_frames_features.size(); } // From here on, treat the video frames as images completely. 
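To make the comment above concrete: "treat the video frames as images" means every frame's features are appended to the same embedding list as still images, with grid_t fixed to 1 and the spatial grid taken from the video's first frame. A standalone sketch of that flattening (simplified stand-in types, not the actual helper):

    #include <array>
    #include <cstddef>
    #include <vector>

    struct Frame { size_t grid_h; size_t grid_w; };              // stand-in for EncodedImage
    struct Video { std::vector<Frame> video_frames_features; };  // stand-in for EncodedVideo

    // Produce one {t, h, w} grid per frame; each frame is one step deep in time.
    std::vector<std::array<size_t, 3>> flatten_video_grids(const std::vector<Video>& videos) {
        std::vector<std::array<size_t, 3>> grids;
        for (const auto& video : videos) {
            for (size_t i = 0; i < video.video_frames_features.size(); ++i) {
                // grid_h/grid_w come from the first frame, matching the loop in the hunk below.
                grids.push_back({1, video.video_frames_features[0].grid_h,
                                    video.video_frames_features[0].grid_w});
            }
        }
        return grids;
    }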
@@ -434,12 +434,12 @@ std::pair, std::vector>> reorder_i images_grid_thw.reserve(encoded_images.size() + video_frames_sz); for (const auto& encoded_video : videos) { - for (const auto& encoded_frame : encoded_video) { + for (const auto& encoded_frame : encoded_video.video_frames_features) { ov::Tensor single_image_embeds = encoded_frame.resized_source; image_embeds.push_back(std::move(single_image_embeds)); size_t grid_t = 1; - size_t grid_h = encoded_video[0].resized_source_size.height; - size_t grid_w = encoded_video[0].resized_source_size.width; + size_t grid_h = encoded_video.video_frames_features[0].resized_source_size.height; + size_t grid_w = encoded_video.video_frames_features[0].resized_source_size.width; images_grid_thw.push_back({grid_t, grid_h, grid_w}); } } @@ -800,7 +800,7 @@ EncodedImage VisionEncoderQwen2VL::encode(const ov::Tensor& image, const ov::Any return encode_with_imagepreprocess_ov({image}, config_map); } -std::vector VisionEncoderQwen2VL::encode_frames(const std::vector& frames, +EncodedVideo VisionEncoderQwen2VL::encode_frames(const std::vector& frames, const ov::AnyMap& config_map) { ProcessorConfig config = utils::from_any_map(config_map, m_processor_config); std::vector encoded_imgs; @@ -830,7 +830,9 @@ std::vector VisionEncoderQwen2VL::encode_frames(const std::vector< encoded_imgs.push_back(encoded_img); } - return encoded_imgs; + EncodedVideo encoded_video; + encoded_video.video_frames_features = std::move(encoded_imgs); + return encoded_video; } InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( @@ -887,15 +889,15 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL( } NormalizedPrompt InputsEmbedderQwen2VL::normalize_prompt(const std::string& prompt, - size_t base_id, + size_t image_base_id, size_t video_base_id, const std::vector& images, const std::vector& videos) const { // Images - auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, base_id, images.size()); + auto [unified_prompt, images_sequence] = normalize(prompt, NATIVE_TAG, NATIVE_TAG, image_base_id, images.size()); std::vector> images_grid_thw; images_grid_thw.reserve(images.size()); - + for (const auto& encoded_image : images) { size_t grid_t = 1; size_t grid_h = encoded_image.resized_source_size.height; @@ -904,7 +906,7 @@ NormalizedPrompt InputsEmbedderQwen2VL::normalize_prompt(const std::string& prom } for (size_t new_image_id : images_sequence) { - auto [grid_t, grid_h, grid_w] = images_grid_thw.at(new_image_id - base_id); + auto [grid_t, grid_h, grid_w] = images_grid_thw.at(new_image_id - image_base_id); size_t merge_length = std::pow(m_vision_encoder->get_processor_config().merge_size, 2); size_t num_image_pad_tokens = grid_t * grid_h * grid_w / merge_length; @@ -924,10 +926,10 @@ NormalizedPrompt InputsEmbedderQwen2VL::normalize_prompt(const std::string& prom video_grid_thw.reserve(videos.size()); for (const auto& encoded_vd : videos) { - size_t grid_t = encoded_vd.size(); + size_t grid_t = encoded_vd.video_frames_features.size(); OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); - size_t grid_h = encoded_vd[0].resized_source_size.height; - size_t grid_w = encoded_vd[0].resized_source_size.width; + size_t grid_h = encoded_vd.video_frames_features[0].resized_source_size.height; + size_t grid_w = encoded_vd.video_frames_features[0].resized_source_size.width; video_grid_thw.push_back({grid_t, grid_h, grid_w}); } @@ -970,10 +972,10 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p std::vector> video_grid_thw; 
video_grid_thw.reserve(videos.size()); for (const auto& encoded_video : videos) { - size_t grid_t = encoded_video.size(); + size_t grid_t = encoded_video.video_frames_features.size(); OPENVINO_ASSERT(grid_t > 0, "Input at least one frame for video."); - size_t grid_h = encoded_video[0].resized_source_size.height; - size_t grid_w = encoded_video[0].resized_source_size.width; + size_t grid_h = encoded_video.video_frames_features[0].resized_source_size.height; + size_t grid_w = encoded_video.video_frames_features[0].resized_source_size.width; video_grid_thw.push_back({grid_t, grid_h, grid_w}); } @@ -1012,12 +1014,12 @@ ov::Tensor InputsEmbedderQwen2VL::get_inputs_embeds(const std::string& unified_p return qwen2_vl_utils::merge_text_and_image_embeddings(input_ids, text_embeds, merged_image_embeddings_tensor, image_pad_token_id, video_pad_token_id); } -std::vector InputsEmbedderQwen2VL::encode_video(const std::vector& videos) { - std::vector embeds; +std::vector InputsEmbedderQwen2VL::encode_videos(const std::vector& videos) { + std::vector embeds; for (const ov::Tensor& single_video : videos) { std::vector single_frames = to_single_image_tensors({single_video}); - auto embeds_video = m_vision_encoder->encode_frames(single_frames); - embeds.insert(embeds.end(), embeds_video.begin(), embeds_video.end()); + auto encoded_video = m_vision_encoder->encode_frames(single_frames); + embeds.emplace_back(encoded_video); } return embeds; } @@ -1051,7 +1053,7 @@ void InputsEmbedderQwen2VL::finish_chat() { ov::Tensor InputsEmbedderQwen2VL::run_image_embeddings_merger( const std::vector& images, const std::vector& images_sequence, - const std::vector>& videos, + const std::vector& videos, const std::vector& videos_sequence ) { auto [reordered_image_embeds, reordered_images_grid_thw] = qwen2_vl_utils::reorder_image_video_embeds_and_grid_thw(images, images_sequence, videos, videos_sequence); diff --git a/src/cpp/src/visual_language/qwen2vl/classes.hpp b/src/cpp/src/visual_language/qwen2vl/classes.hpp index 554281e5e8..6e8700c34c 100644 --- a/src/cpp/src/visual_language/qwen2vl/classes.hpp +++ b/src/cpp/src/visual_language/qwen2vl/classes.hpp @@ -18,7 +18,7 @@ class VisionEncoderQwen2VL : public VisionEncoder { explicit VisionEncoderQwen2VL(const ModelsMap& models_map, const std::filesystem::path& config_dir_path, const std::string& device, const ov::AnyMap properties); EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map) override; - std::vector encode_frames(const std::vector& frames, const ov::AnyMap& config_map) override; + EncodedVideo encode_frames(const std::vector& frames, const ov::AnyMap& config_map) override; private: EncodedImage encode_with_imagepreprocess_cpp(const std::vector& image, const ov::AnyMap& config_map); @@ -51,7 +51,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { const std::vector& image_sequence = {}, const std::vector& videos_sequence = {}) override; - std::vector encode_video(const std::vector& videos) override; + std::vector encode_videos(const std::vector& videos) override; std::pair> get_position_ids(const size_t inputs_embeds_size, const size_t history_size) override; @@ -59,7 +59,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { void finish_chat() override; - NormlizedPrompt normalize_prompt( + NormalizedPrompt normalize_prompt( const std::string& prompt, size_t base_id, const std::vector& images) const override { @@ -69,10 +69,10 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { 
NormalizedPrompt normalize_prompt( const std::string& prompt, - size_t base_id, + size_t image_base_id, size_t video_base_id, const std::vector<EncodedImage>& images, - const std::vector<std::vector<EncodedImage>>& videos) const override; + const std::vector<EncodedVideo>& videos) const override; protected: // A model for merging image embeddings (hidden states), rotary_pos_emb and attension_mask. @@ -92,7 +92,7 @@ class InputsEmbedderQwen2VL : public InputsEmbedder::IInputsEmbedder { virtual ov::Tensor run_image_embeddings_merger( const std::vector<EncodedImage>& images, const std::vector<size_t>& images_sequence, - const std::vector<std::vector<EncodedImage>>& videos, + const std::vector<EncodedVideo>& videos, const std::vector<size_t>& videos_sequence); ov::Tensor get_rotary_pos_emb(const std::vector<std::array<size_t, 3>>& grids_thw); @@ -114,7 +114,7 @@ namespace qwen2_vl_utils { std::pair<std::vector<ov::Tensor>, std::vector<std::array<size_t, 3>>> reorder_image_video_embeds_and_grid_thw( const std::vector<EncodedImage>& encoded_images, const std::vector<size_t>& images_sequence, - const std::vector<std::vector<EncodedImage>>& videos, + const std::vector<EncodedVideo>& videos, const std::vector<size_t>& videos_sequence ); diff --git a/src/cpp/src/visual_language/vision_encoder.hpp b/src/cpp/src/visual_language/vision_encoder.hpp index 4c69c47248..419f1fbc66 100644 --- a/src/cpp/src/visual_language/vision_encoder.hpp +++ b/src/cpp/src/visual_language/vision_encoder.hpp @@ -68,12 +68,9 @@ struct EncodedVideo { /// @brief Number of video tokens required to append to a normalized prompt size_t num_video_tokens; -}; -struct NormalizedPrompt { - std::string unified_prompt; - std::vector<size_t> images_sequence; - std::vector<size_t> videos_sequence; + /// @brief Some models' video processing is similar. Reuse EncodedImage for models: QWen2-VL, QWen2.5-VL. + std::vector<EncodedImage> video_frames_features; }; /// @brief A class used to infer embeddings of an image using @@ -120,7 +117,7 @@ class VisionEncoder { virtual EncodedImage encode(const ov::Tensor& image, const ov::AnyMap& config_map = {}) = 0; /// @brief Compute embeddings of a or multiple video given - virtual std::vector<EncodedImage> encode_frames(const std::vector<ov::Tensor>& frames, const ov::AnyMap& config_map = {}) { + virtual EncodedVideo encode_frames(const std::vector<ov::Tensor>& frames, const ov::AnyMap& config_map = {}) { OPENVINO_THROW("The current model does not support 'video' input, please use 'images' instead."); } diff --git a/src/python/py_continuous_batching_pipeline.cpp b/src/python/py_continuous_batching_pipeline.cpp index dfb4f32d67..8466b7616f 100644 --- a/src/python/py_continuous_batching_pipeline.cpp +++ b/src/python/py_continuous_batching_pipeline.cpp @@ -379,23 +379,12 @@ void init_continuous_batching_pipeline(py::module_& m) { .def_readonly("avg_cache_usage", &PipelineMetrics::avg_cache_usage) .def_readonly("max_cache_usage", &PipelineMetrics::max_cache_usage); - py::class_<ContinuousBatchingPipeline>(m, - "ContinuousBatchingPipeline", - "This class is used for generation with LLMs with continuous batchig") - .def(py::init([](const std::filesystem::path& models_path, - const SchedulerConfig& scheduler_config, - const std::string& device, - const std::map<std::string, py::object>& llm_plugin_config, - const std::map<std::string, py::object>& tokenizer_plugin_config, - const std::map<std::string, py::object>& inputs_embedder_plugin_config) { + py::class_<ContinuousBatchingPipeline>(m, "ContinuousBatchingPipeline", "This class is used for generation with LLMs with continuous batching") + .def(py::init([](const std::filesystem::path& models_path, const SchedulerConfig& scheduler_config, const std::string& device, const std::map<std::string, py::object>& llm_plugin_config, + const std::map<std::string, py::object>& tokenizer_plugin_config, const std::map<std::string, py::object>& inputs_embedder_plugin_config) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique<ContinuousBatchingPipeline>( - models_path, - scheduler_config,
- device, - pyutils::properties_to_any_map(llm_plugin_config), - pyutils::properties_to_any_map(tokenizer_plugin_config), - pyutils::properties_to_any_map(inputs_embedder_plugin_config)); + return std::make_unique(models_path, scheduler_config, device, pyutils::properties_to_any_map(llm_plugin_config), + pyutils::properties_to_any_map(tokenizer_plugin_config), pyutils::properties_to_any_map(inputs_embedder_plugin_config)); }), py::arg("models_path"), py::arg("scheduler_config"), @@ -404,17 +393,9 @@ void init_continuous_batching_pipeline(py::module_& m) { py::arg("tokenizer_properties") = ov::AnyMap({}), py::arg("vision_encoder_properties") = ov::AnyMap({})) - .def(py::init([](const std::filesystem::path& models_path, - const ov::genai::Tokenizer& tokenizer, - const SchedulerConfig& scheduler_config, - const std::string& device, - const py::kwargs& kwargs) { + .def(py::init([](const std::filesystem::path& models_path, const ov::genai::Tokenizer& tokenizer, const SchedulerConfig& scheduler_config, const std::string& device, const py::kwargs& kwargs) { ScopedVar env_manager(pyutils::ov_tokenizers_module_path()); - return std::make_unique(models_path, - tokenizer, - scheduler_config, - device, - pyutils::kwargs_to_any_map(kwargs)); + return std::make_unique(models_path, tokenizer, scheduler_config, device, pyutils::kwargs_to_any_map(kwargs)); }), py::arg("models_path"), py::arg("tokenizer"), @@ -439,34 +420,36 @@ void init_continuous_batching_pipeline(py::module_& m) { [](ContinuousBatchingPipeline& pipe, const std::vector& input_ids, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& streamer) - -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer + ) -> py::typing::Union> { return __call_cb_generate(pipe, input_ids, generation_config, streamer); }, py::arg("input_ids"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{}) + py::arg("streamer") = std::monostate{} + ) .def( "generate", [](ContinuousBatchingPipeline& pipe, const std::vector& prompts, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& streamer) - -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer + ) -> py::typing::Union> { return __call_cb_generate(pipe, prompts, generation_config, streamer); }, py::arg("prompts"), py::arg("generation_config"), - py::arg("streamer") = std::monostate{}) + py::arg("streamer") = std::monostate{} + ) .def( "generate", [](ContinuousBatchingPipeline& pipe, const std::string& prompt, const ov::genai::GenerationConfig& generation_config, - const pyutils::PyBindStreamerVariant& streamer) - -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& streamer + ) -> py::typing::Union> { std::vector prompts = { prompt }; std::vector generation_configs = { generation_config }; return __call_cb_generate(pipe, prompts, generation_configs, streamer); @@ -504,8 +487,8 @@ void init_continuous_batching_pipeline(py::module_& m) { const std::vector& prompts, const std::vector>& images, const std::vector& generation_config, - const pyutils::PyBindStreamerVariant& py_streamer) - -> py::typing::Union> { + const pyutils::PyBindStreamerVariant& py_streamer + ) -> py::typing::Union> { ov::genai::StreamerVariant streamer = pyutils::pystreamer_to_streamer(py_streamer); std::vector generated_results; { diff --git a/tests/python_tests/test_vlm_pipeline.py b/tests/python_tests/test_vlm_pipeline.py index cfe49bd97b..c6f853f71a 100644 --- a/tests/python_tests/test_vlm_pipeline.py +++ 
b/tests/python_tests/test_vlm_pipeline.py @@ -287,7 +287,7 @@ def streamer(word: str) -> bool: gc.collect() @pytest.mark.precommit -@pytest.mark.parametrize("model_id", model_video_ids) +@pytest.mark.parametrize("model_id", video_model_ids) @pytest.mark.parametrize("backend", attention_backend) def test_vlm_pipeline_video_input(model_id, backend, cat_tensor, countdown_video): def streamer(word: str) -> bool: From 08e09673366b488e89a8fcc5c3822cbba17ceb9e Mon Sep 17 00:00:00 2001 From: "xiping.yan" Date: Sun, 12 Oct 2025 13:26:40 +0800 Subject: [PATCH 082/118] Remove duplicated add_request. Signed-off-by: xiping.yan --- src/cpp/src/continuous_batching/pipeline_base.cpp | 3 +-- src/python/openvino_genai/py_openvino_genai.pyi | 11 +++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cpp/src/continuous_batching/pipeline_base.cpp b/src/cpp/src/continuous_batching/pipeline_base.cpp index 40f7d454fe..ede1a0d257 100644 --- a/src/cpp/src/continuous_batching/pipeline_base.cpp +++ b/src/cpp/src/continuous_batching/pipeline_base.cpp @@ -197,7 +197,6 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::generate( std::string templated_history = m_tokenizer.apply_chat_template(m_history, true); m_inputs_embedder->set_apply_chat_template_status(false); - if (m_inputs_embedder->has_token_type_ids()) { auto [embeds, tt_ids] = m_inputs_embedder->get_inputs_embeds_with_token_type_ids(templated_history, m_history_images, @@ -310,7 +309,7 @@ ContinuousBatchingPipeline::IContinuousBatchingPipeline::add_request(uint64_t re { std::lock_guard lock(m_embeddings_mutex); m_inputs_embedder->set_apply_chat_template_status(sampling_params.apply_chat_template); - auto encoded_images = m_inputs_embedder->encode_images(rgbs); + const auto encoded_images = m_inputs_embedder->encode_images(rgbs); const auto [unified_prompt, image_sequence, video_sequence] = m_inputs_embedder->normalize_prompt(prompt, 0, encoded_images); inputs = m_inputs_embedder->get_inputs_embeds(unified_prompt, encoded_images, metrics, true, image_sequence); diff --git a/src/python/openvino_genai/py_openvino_genai.pyi b/src/python/openvino_genai/py_openvino_genai.pyi index c51df5042d..2486b1a253 100644 --- a/src/python/openvino_genai/py_openvino_genai.pyi +++ b/src/python/openvino_genai/py_openvino_genai.pyi @@ -431,9 +431,6 @@ class ContinuousBatchingPipeline: @typing.overload def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: ... - @typing.overload - def add_request(self, request_id: typing.SupportsInt, prompt: str, images: collections.abc.Sequence[openvino._pyopenvino.Tensor], videos: collections.abc.Sequence[openvino._pyopenvino.Tensor], generation_config: GenerationConfig) -> GenerationHandle: - ... def finish_chat(self) -> None: ... 
@typing.overload @@ -3519,6 +3516,9 @@ class VLMPipeline: :param images: image or list of images :type images: list[ov.Tensor] or ov.Tensor + :param videos: list of frames + :type videos: list[ov.Tensor] + :param generation_config: generation_config :type generation_config: GenerationConfig or a dict @@ -3593,6 +3593,7 @@ class VLMPipeline: InternVL2: \\n llava-1.5-7b-hf: LLaVA-NeXT: + LLaVa-NeXT-Video: nanoLLaVA: \\n nanoLLaVA-1.5: \\n MiniCPM-V-2_6: (./)\\n @@ -3601,7 +3602,9 @@ class VLMPipeline: Qwen2-VL: <|vision_start|><|image_pad|><|vision_end|> Qwen2.5-VL: <|vision_start|><|image_pad|><|vision_end|> gemma-3-4b-it: - If the prompt doesn't contain image tags, but images are + Model's native video tag can be used to refer to a video: + LLaVa-NeXT-Video: