8 changes: 4 additions & 4 deletions src/cpp/src/lm_encoding.cpp
@@ -84,7 +84,8 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
     utils::KVCacheState& kv_cache_state,
     EmbeddingsModel::Ptr m_embedding,
     std::optional<int64_t> rope_delta,
-    const size_t max_kv_cache_size
+    const size_t max_kv_cache_size,
+    const bool use_intermediate_remote_tensor
 ) {
     std::vector<GenerationHandle> generations;
     for (SequenceGroup::Ptr sequence_group : sequence_groups) {
@@ -228,10 +229,9 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
         }
 
         if (m_embedding) {
-            constexpr bool return_remote_tensor = true;
             CircularBufferQueueElementGuard<EmbeddingsRequest> embeddings_request_guard(m_embedding->get_request_queue().get());
             EmbeddingsRequest& req = embeddings_request_guard.get();
-            const ov::Tensor& embed_prompt_tensor = m_embedding->infer(req, new_input_ids, return_remote_tensor);
+            const ov::Tensor& embed_prompt_tensor = m_embedding->infer(req, new_input_ids, use_intermediate_remote_tensor);
             m_llm.set_tensor("inputs_embeds", embed_prompt_tensor);
             if (token_type_ids.has_value()) {
                 ov::Tensor new_token_type_ids(ov::element::i64, {total_num_tokens, 1});
@@ -276,7 +276,7 @@ ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(
 
         sampler_output = sampler.sample(active_sequence_groups, m_llm.get_tensor("logits"));
         free_non_running_requests(); // handle sampler output
-
+
         raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
         raw_perf_counters.m_batch_sizes.emplace_back(sampler_output.num_generated_tokens);
     }
2 changes: 1 addition & 1 deletion src/cpp/src/lm_encoding.hpp
@@ -11,7 +11,7 @@ namespace genai {
 ov::genai::utils::GenerationFinishInfo get_lm_encoded_results(ov::InferRequest& m_llm, const ov::Tensor& input_ids, const ov::Tensor& attention_mask,
                                                               const std::shared_ptr<StreamerBase>& streamer_ptr, Sampler& sampler, std::vector<SequenceGroup::Ptr> sequence_groups,
                                                               std::optional<ov::Tensor> position_ids, std::optional<ov::Tensor> token_type_ids, utils::KVCacheState& m_kv_cache_state, EmbeddingsModel::Ptr m_embedding,
-                                                              std::optional<int64_t> rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits<size_t>::max());
+                                                              std::optional<int64_t> rope_delta = std::nullopt, const size_t max_kv_cache_size = std::numeric_limits<size_t>::max(), const bool use_intermediate_remote_tensor = true);
 
 
 void align_kv_cache_and_history(const ov::Tensor& new_chat_tokens, utils::KVCacheState& kv_cache_state);
39 changes: 29 additions & 10 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -8,6 +8,8 @@
 #include "openvino/genai/visual_language/perf_metrics.hpp"
 #include "openvino/genai/tokenizer.hpp"
 #include "openvino/genai/text_streamer.hpp"
+#include "openvino/runtime/properties.hpp"
+#include "openvino/runtime/auto/properties.hpp"
 
 #include "visual_language/vlm_config.hpp"
 #include "visual_language/inputs_embedder.hpp"
@@ -35,6 +37,14 @@ void update_npu_properties(const std::filesystem::path& models_dir, ov::AnyMap&
         break;
     }
 }
+
+void npu_auto_default_properties(ov::AnyMap& device_properties) {
+    auto auto_properties = utils::pop_or_default<ov::AnyMap>(device_properties, "AUTO", {});
+    auto_properties.insert(ov::device::priorities("CPU"));
+    auto_properties.insert(ov::intel_auto::enable_startup_fallback(false));
+
+    device_properties["AUTO"] = auto_properties;
+}
 }
 
 class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
@@ -93,23 +103,24 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         //     ov::device::properties("NPU", ...),
         //     ov::device::properties("CPU", ...)
         // }
-        auto device_propertes = utils::pop_or_default<ov::AnyMap>(
+        auto device_properties = utils::pop_or_default<ov::AnyMap>(
             properties_copy, ov::device::properties.name(), { }
         );
         // Otherwise, the same properties are used for all models and devices
-        auto lm_properties = device_propertes.empty()
+        auto lm_properties = device_properties.empty()
             ? properties_copy
-            : utils::pop_or_default<ov::AnyMap>(device_propertes, device, {});
+            : utils::pop_or_default<ov::AnyMap>(device_properties, device, {});
 
         ov::CompiledModel compiled_language_model;
         auto embedder_device = device;
         if (m_is_npu) {
-            embedder_device = "CPU";
+            embedder_device = "AUTO";
             utils::KVDesc kv_desc;
             update_npu_properties(models_dir, lm_properties);
             std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
             m_max_prompt_len = kv_desc.max_prompt_len;
             m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
+            npu_auto_default_properties(device_properties);
         } else {
             compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
         }
@@ -118,9 +129,9 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         m_language = compiled_language_model.create_infer_request();
         m_language.get_tensor("attention_mask").set_shape({1, 0});
 
-        auto embedder_properties = device_propertes.empty()
+        auto embedder_properties = device_properties.empty()
             ? properties_copy
-            : utils::pop_or_default<ov::AnyMap>(device_propertes, embedder_device, {});
+            : utils::pop_or_default<ov::AnyMap>(device_properties, embedder_device, {});
 
         m_inputs_embedder = std::make_shared<InputsEmbedder>(models_dir, embedder_device, embedder_properties);
         m_tokenizer = m_inputs_embedder->get_tokenizer();
@@ -206,8 +217,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
 
         setup_generation_config(generation_config);
 
+        bool intermediate_remote_tensor = true;
         if (m_is_npu) {
             validate_inputs_for_npu(images, videos, generation_config);
+            intermediate_remote_tensor = false;
         }
 
         m_inputs_embedder->set_vision_token_pruning_config(generation_config.pruning_ratio,
@@ -249,7 +262,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             video_sequence,
             generation_config,
             perf_metrics,
-            streamer
+            streamer,
+            intermediate_remote_tensor
         );
 
         EncodedResults& encoded_result = finish_info.results;
@@ -338,8 +352,10 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
 
         setup_generation_config(generation_config);
 
+        bool intermediate_remote_tensor = true;
         if (m_is_npu) {
             validate_inputs_for_npu(images, videos, generation_config);
+            intermediate_remote_tensor = false;
         }
 
         VLMChatContext chat_context(history, m_vision_registry, *m_inputs_embedder);
@@ -382,7 +398,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
             video_seq,
             generation_config,
             perf_metrics,
-            streamer
+            streamer,
+            intermediate_remote_tensor
         );
 
         EncodedResults& encoded_result = generation_finish_info.results;
@@ -513,7 +530,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
         const std::vector<size_t>& video_sequence,
         GenerationConfig& generation_config,
         VLMPerfMetrics& perf_metrics,
-        const StreamerVariant& streamer
+        const StreamerVariant& streamer,
+        const bool use_intermediate_remote_tensor
     ) {
         ov::Tensor inputs_embeds;
         std::optional<ov::Tensor> token_type_ids;
@@ -593,7 +611,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
 
         return ov::genai::get_lm_encoded_results(
             m_language, inputs_embeds, new_atten_mask, streamer_ptr, m_sampler, std::move(requests),
-            position_ids, token_type_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size
+            position_ids, token_type_ids, kv_cache_state, m_embedding, rope_delta, m_max_kv_cache_size,
+            use_intermediate_remote_tensor
         );
     }
 };
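
For reference, the effect of the new npu_auto_default_properties helper in isolation — a minimal, self-contained sketch, where the main() scaffolding and the plain ov::AnyMap setup stand in for the pipeline's utils::pop_or_default call:

#include <openvino/runtime/properties.hpp>
#include <openvino/runtime/auto/properties.hpp>

int main() {
    ov::AnyMap device_properties;  // per-device map popped from the pipeline config
    ov::AnyMap auto_properties;    // a user-supplied "AUTO" entry would be extracted here

    // std::map::insert is a no-op for keys that are already present, so these
    // defaults never override an explicit user configuration.
    auto_properties.insert(ov::device::priorities("CPU"));
    auto_properties.insert(ov::intel_auto::enable_startup_fallback(false));

    device_properties["AUTO"] = auto_properties;
    return 0;
}

Because insert() keeps existing keys, a caller who supplies their own "AUTO" map — as the new test below does with device priorities "CPU,GPU" — takes precedence over both defaults.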
20 changes: 20 additions & 0 deletions tests/python_tests/test_vlm_pipeline.py
@@ -1177,6 +1177,26 @@ def test_vlm_npu_no_image(ov_npu_pipe_model: VlmModelInfo):
     )
 
 
+@pytest.mark.skipif(
+    sys.platform == "darwin" or platform.machine() in ["aarch64", "arm64", "ARM64"],
+    reason="NPU plugin is available only on Linux and Windows x86_64",
+)
+def test_vlm_npu_auto_config(cat_tensor):
+    models_path = _get_ov_model(NPU_SUPPORTED_MODELS[0])
+    properties = {
+        "DEVICE_PROPERTIES": {
+            "NPU": {"NPUW_DEVICES": "CPU", "NPUW_ONLINE_PIPELINE": "NONE", "MAX_PROMPT_LEN": 2048},
+            "AUTO": {openvino.properties.device.priorities: "CPU,GPU"},
+        }
+    }
+
+    ov_pipe = VLMPipeline(models_path, "NPU", config=properties)
+
+    generation_config = _setup_generation_config(ov_pipe)
+
+    ov_pipe.generate(PROMPTS[0], images=[cat_tensor], generation_config=generation_config)
+
+
 @parametrize_one_model_npu
 @pytest.mark.skipif(
     sys.platform == "darwin" or platform.machine() in ["aarch64", "arm64", "ARM64"],
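
The same configuration is reachable from the C++ API. A minimal usage sketch, assuming a locally exported NPU-supported VLM at ./model — the model path, prompt, and token budget are placeholders; the nested-map keys mirror the test above and the pipeline.cpp changes:

#include <openvino/genai/visual_language/pipeline.hpp>

#include <iostream>

int main() {
    // Per-device properties: the language model compiles for NPU,
    // while the embedder now runs on AUTO (CPU priority by default).
    ov::AnyMap device_props{
        {"NPU", ov::AnyMap{{"MAX_PROMPT_LEN", 2048}}},
        {"AUTO", ov::AnyMap{{ov::device::priorities.name(), "CPU,GPU"}}},
    };

    ov::genai::VLMPipeline pipe("./model", "NPU",
                                ov::AnyMap{{ov::device::properties.name(), device_props}});

    // Image inputs elided; see the test above for a call with an image tensor.
    auto result = pipe.generate("Describe the picture.", ov::genai::max_new_tokens(32));
    std::cout << result.texts.at(0) << std::endl;
    return 0;
}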