Skip to content

Commit b4ed057

Browse files
AsyaPronina, ilya-lavrenov, and Wovchena
authored
Raise exception if input prompt exceeds its configured max size on NPU (openvinotoolkit#1996)
Raise exception if input prompt (including chat history if it exists) exceeds its configured maximum size on NPU. Tickets: 1. EISW-162272 2. CVS-164960 --------- Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
1 parent bb2be76 commit b4ed057

File tree

3 files changed

+22
-2
lines changed

3 files changed

+22
-2
lines changed

src/cpp/src/llm_pipeline_stateful.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
2323
if (execution_devices[0].find("NPU") != std::string::npos) {
2424
OPENVINO_ASSERT(execution_devices.size() == 1u);
2525
m_is_npu = true;
26-
const auto max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
26+
m_max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
2727
const auto min_response_len = compiled_model.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
28-
m_max_kv_cache_size = max_prompt_len + min_response_len;
28+
m_max_kv_cache_size = m_max_prompt_len + min_response_len;
2929
}
3030
}
3131

@@ -69,6 +69,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
6969
if (m_is_npu) {
7070
utils::KVDesc kv_desc;
7171
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, *filtered_properties, kv_pos);
72+
m_max_prompt_len = kv_desc.max_prompt_len;
7273
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
7374
} else {
7475
compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
@@ -157,6 +158,14 @@ DecodedResults StatefulLLMPipeline::generate(
157158
}
158159
}
159160

161+
if (m_is_npu) {
162+
// Prefill model in NPU is reshaped to NPUW_LLM_MAX_PROMPT_LEN x NPUW_LLM_MAX_PROMPT_LEN
163+
OPENVINO_ASSERT(encoded_input.input_ids.get_size() <= m_max_prompt_len,
164+
"Stateful LLM pipeline on NPU may only process prompts or hold chat history up to ",
165+
m_max_prompt_len, " tokens. ", encoded_input.input_ids.get_size(), " is passed.\n"
166+
"Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
167+
}
168+
160169
auto encode_stop_time = std::chrono::steady_clock::now();
161170
auto encoded_results = generate(encoded_input, config, streamer);
162171

src/cpp/src/llm_pipeline_stateful.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
2424
ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
2525
// if True, full history will be used as prompt on each chat generation
2626
bool m_use_full_chat_history = false;
27+
size_t m_max_prompt_len = std::numeric_limits<size_t>::max();
2728
size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
2829
bool m_is_npu = false;
2930
// include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat

src/cpp/src/visual_language/pipeline.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
4242
std::shared_ptr<InputsEmbedder> m_inputs_embedder;
4343
// Component for applying sampling to lm outputs
4444
Sampler m_sampler;
45+
size_t m_max_prompt_len = std::numeric_limits<size_t>::max();
4546
size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
4647
bool m_is_npu = false;
4748
public:
@@ -81,6 +82,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
8182
embedder_device = "CPU";
8283
utils::KVDesc kv_desc;
8384
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
85+
m_max_prompt_len = kv_desc.max_prompt_len;
8486
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
8587
} else {
8688
compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
@@ -183,6 +185,14 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
183185
ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
184186
auto end_get_inputs_embeds = std::chrono::steady_clock::now();
185187

188+
if (m_is_npu) {
189+
// Prefill model in NPU is reshaped to NPUW_LLM_MAX_PROMPT_LEN x NPUW_LLM_MAX_PROMPT_LEN
190+
OPENVINO_ASSERT(inputs_embeds.get_shape().at(1) <= m_max_prompt_len,
191+
"VLM pipeline on NPU may only process input embeddings up to ", m_max_prompt_len,
192+
" tokens. ", inputs_embeds.get_shape().at(1), " is passed.\nSet the \"MAX_PROMPT_LEN\""
193+
" config option to increase the limit.");
194+
}
195+
186196
utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
187197
if (m_is_chat_conversation)
188198
utils::trim_kv_cache(m_language, kv_cache_state, std::nullopt);

0 commit comments

Comments (0)