Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/cpp/src/lm_encoding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
utils::KVCacheState& kv_cache_state,
EmbeddingsModel::Ptr m_embedding,
std::optional<int64_t> rope_delta,
const size_t max_kv_cache_size
const size_t max_kv_cache_size,
const bool use_intermediate_remote_tensor
) {
std::vector<GenerationHandle> generations;
for (SequenceGroup::Ptr sequence_group : sequence_groups) {
Expand Down Expand Up @@ -228,10 +229,9 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
}

if (m_embedding) {
constexpr bool return_remote_tensor = true;
CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());
EmbeddingsRequest& req = embeddings_request_guard.get();
const ov::Tensor& embed_prompt_tensor = m_embedding->infer(req, new_input_ids, return_remote_tensor);
const ov::Tensor& embed_prompt_tensor = m_embedding->infer(req, new_input_ids, use_intermediate_remote_tensor);
m_llm.set_tensor("inputs_embeds", embed_prompt_tensor);
if (token_type_ids.has_value()) {
ov::Tensor new_token_type_ids(ov::element::i64, {total_num_tokens, 1});
Expand Down Expand Up @@ -276,7 +276,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(

sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));
free_non_running_requests(); // handle sampler output

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(sampler_output.num_generated_tokens);
}
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/lm_encoding.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace genai {
ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
std::optional<ov::Tensor> position_ids, std::optional<ov::Tensor> token_type_ids, utils::KVCacheState& m_kv_cache_state, EmbeddingsModel::Ptr m_embedding,
std::optional<int64_t> rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits<size_t>::max());
std::optional<int64_t> rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits<size_t>::max(), const bool use_intermediate_remote_tensor = true);


void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state);
Expand Down
67 changes: 43 additions & 24 deletions src/cpp/src/visual_language/pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include "openvino/genai/visual_language/perf_metrics.hpp"
#include "openvino/genai/tokenizer.hpp"
#include "openvino/genai/text_streamer.hpp"
#include "openvino/runtime/properties.hpp"
#include "openvino/runtime/auto/properties.hpp"

#include "visual_language/vlm_config.hpp"
#include "visual_language/inputs_embedder.hpp"
Expand Down Expand Up @@ -36,6 +38,14 @@ void update_npu_properties(const std::filesystem::path& models_dir, ov::AnyMap&
break;
}
}

// Installs default settings for the "AUTO" virtual device used by the NPU
// pipeline's embedder: prefer GPU over CPU and disable startup fallback.
// Any "AUTO" sub-map already present in `device_properties` is preserved;
// std::map::insert is a no-op for keys that already exist, so explicit
// user-supplied values take precedence over these defaults.
void npu_auto_default_properties(ov::AnyMap& device_properties) {
    auto auto_properties = utils::pop_or_default<ov::AnyMap>(device_properties, "AUTO", {});
    // insert() does not overwrite existing entries — user config wins.
    auto_properties.insert(ov::device::priorities("GPU", "CPU"));
    auto_properties.insert(ov::intel_auto::enable_startup_fallback(false));
    device_properties["AUTO"] = auto_properties;
}
}

class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
Expand Down Expand Up @@ -94,23 +104,24 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
// ov::device::properties("NPU", ...),
// ov::device::properties("CPU", ...)
// }
auto device_propertes = utils::pop_or_default<ov::AnyMap>(
auto device_properties = utils::pop_or_default<ov::AnyMap>(
properties_copy, ov::device::properties.name(), { }
);
// Otherwise, the same properties are used for all models and devices
auto lm_properties = device_propertes.empty()
auto lm_properties = device_properties.empty()
? properties_copy
: utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
: utils::pop_or_default<ov::AnyMap>(device_properties, device, {});

ov::CompiledModel compiled_language_model;
auto embedder_device = device;
if (m_is_npu) {
embedder_device = "CPU";
embedder_device = "AUTO";
utils::KVDesc kv_desc;
update_npu_properties(models_dir, lm_properties);
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
m_max_prompt_len = kv_desc.max_prompt_len;
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
npu_auto_default_properties(device_properties);
} else {
compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
}
Expand All @@ -119,9 +130,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
m_language = compiled_language_model.create_infer_request();
m_language.get_tensor("attention_mask").set_shape({1, 0});

auto embedder_properties = device_propertes.empty()
auto embedder_properties = device_properties.empty()
? properties_copy
: utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
: utils::pop_or_default<ov::AnyMap>(device_properties, embedder_device, {});

m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
m_tokenizer = m_inputs_embedder->get_tokenizer();
Expand Down Expand Up @@ -207,8 +218,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{

setup_generation_config(generation_config);

bool intermediate_remote_tensor = true;
if (m_is_npu) {
validate_inputs_for_npu(images, videos, generation_config);
intermediate_remote_tensor = false;
}

m_inputs_embedder->set_vision_token_pruning_config(generation_config.pruning_ratio,
Expand Down Expand Up @@ -250,7 +263,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
video_sequence,
generation_config,
perf_metrics,
streamer
streamer,
intermediate_remote_tensor
);

EncodedResults& encoded_result = finish_info.results;
Expand Down Expand Up @@ -334,13 +348,15 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
auto generate_start_time = std::chrono::steady_clock::now();
VLMPerfMetrics perf_metrics;
auto& raw_counters = perf_metrics.raw_metrics;

m_is_chat_conversation = true;

setup_generation_config(generation_config);


bool intermediate_remote_tensor = true;
if (m_is_npu) {
validate_inputs_for_npu(images, videos, generation_config);
intermediate_remote_tensor = false;
}

VLMChatContext chat_context(history, m_vision_registry, *m_inputs_embedder);
Expand All @@ -362,19 +378,19 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{

ov::genai::utils::GenerationFinishInfo generation_finish_info;

const auto& images_embeds = use_full_history
? processed_chat_data.encoded_images
const auto& images_embeds = use_full_history
? processed_chat_data.encoded_images
: processed_chat_data.new_encoded_images;
const auto& videos_embeds = use_full_history
? processed_chat_data.encoded_videos
const auto& videos_embeds = use_full_history
? processed_chat_data.encoded_videos
: processed_chat_data.new_encoded_videos;
const auto& image_seq = use_full_history
? processed_chat_data.image_sequence
const auto& image_seq = use_full_history
? processed_chat_data.image_sequence
: processed_chat_data.new_image_sequence;
const auto& video_seq = use_full_history
? processed_chat_data.video_sequence
const auto& video_seq = use_full_history
? processed_chat_data.video_sequence
: processed_chat_data.new_video_sequence;

generation_finish_info = prepare_inputs_and_generate(
templated_history,
images_embeds,
Expand All @@ -383,7 +399,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
video_seq,
generation_config,
perf_metrics,
streamer
streamer,
intermediate_remote_tensor
);

EncodedResults& encoded_result = generation_finish_info.results;
Expand All @@ -397,7 +414,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
auto decode_end_time = std::chrono::steady_clock::now();

std::string decoded_text = decoded.texts.at(0);

m_inputs_embedder->update_chat_history(decoded_text, generation_finish_info.streaming_finish_status);

if (generation_finish_info.streaming_finish_status == ov::genai::GenerationStatus::CANCEL) {
Expand Down Expand Up @@ -515,7 +532,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
const std::vector<size_t>& video_sequence,
GenerationConfig& generation_config,
VLMPerfMetrics& perf_metrics,
const StreamerVariant& streamer
const StreamerVariant& streamer,
const bool use_intermediate_remote_tensor
) {
ov::Tensor inputs_embeds;
std::optional<ov::Tensor> token_type_ids;
Expand Down Expand Up @@ -569,7 +587,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
OPENVINO_ASSERT(prompt_ids.get_size() >= tokenized_history.size(), "Prompt ids size is less than tokenized history size");
std::fill_n(prompt_ids.data<int64_t>(), prompt_ids.get_size(), m_tokenizer.get_pad_token_id());
std::copy(tokenized_history.begin(), tokenized_history.end(), prompt_ids.data<int64_t>());

// Update perf metrics with num_input_tokens
perf_metrics.num_input_tokens = prompt_ids.get_size();

Expand All @@ -595,7 +613,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{

return ov::genai::get_lm_encoded_results(
m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, std::move(requests),
position_ids, token_type_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size
position_ids, token_type_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size,
use_intermediate_remote_tensor
);
}
};
Expand Down
Loading
Loading