Skip to content

Commit b4ed057

Browse files
AsyaPronina, ilya-lavrenov, and Wovchena
authored
Raise exception if input prompt exceeds its configured max size on NPU (openvinotoolkit#1996)
Raise exception if input prompt (including chat history if it exists) exceeds its configured maximum size on NPU. Tickets: 1. EISW-162272 2. CVS-164960 --------- Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com> Co-authored-by: Vladimir Zlobin <vladimir.zlobin@intel.com>
1 parent bb2be76 commit b4ed057

File tree

3 files changed

+22
-2
lines changed

3 files changed

+22
-2
lines changed

src/cpp/src/llm_pipeline_stateful.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
2323
if (execution_devices[0].find("NPU") != std::string::npos) {
2424
OPENVINO_ASSERT(execution_devices.size() == 1u);
2525
m_is_npu = true;
26-
const auto max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
26+
m_max_prompt_len = compiled_model.get_property("NPUW_LLM_MAX_PROMPT_LEN").as<uint32_t>();
2727
const auto min_response_len = compiled_model.get_property("NPUW_LLM_MIN_RESPONSE_LEN").as<uint32_t>();
28-
m_max_kv_cache_size = max_prompt_len + min_response_len;
28+
m_max_kv_cache_size = m_max_prompt_len + min_response_len;
2929
}
3030
}
3131

@@ -69,6 +69,7 @@ StatefulLLMPipeline::StatefulLLMPipeline(
6969
if (m_is_npu) {
7070
utils::KVDesc kv_desc;
7171
std::tie(compiled_model, kv_desc) = utils::compile_decoder_for_npu(model, *filtered_properties, kv_pos);
72+
m_max_prompt_len = kv_desc.max_prompt_len;
7273
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
7374
} else {
7475
compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
@@ -157,6 +158,14 @@ DecodedResults StatefulLLMPipeline::generate(
157158
}
158159
}
159160

161+
if (m_is_npu) {
162+
// Prefill model in NPU is reshaped to NPUW_LLM_MAX_PROMPT_LEN x NPUW_LLM_MAX_PROMPT_LEN
163+
OPENVINO_ASSERT(encoded_input.input_ids.get_size() <= m_max_prompt_len,
164+
"Stateful LLM pipeline on NPU may only process prompts or hold chat history up to ",
165+
m_max_prompt_len, " tokens. ", encoded_input.input_ids.get_size(), " is passed.\n"
166+
"Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
167+
}
168+
160169
auto encode_stop_time = std::chrono::steady_clock::now();
161170
auto encoded_results = generate(encoded_input, config, streamer);
162171

src/cpp/src/llm_pipeline_stateful.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ class StatefulLLMPipeline final : public LLMPipelineImplBase {
2424
ov::genai::GenerationStatus m_chat_generation_finish_status = ov::genai::GenerationStatus::RUNNING;
2525
// if True, full history will be used as prompt on each chat generation
2626
bool m_use_full_chat_history = false;
27+
size_t m_max_prompt_len = std::numeric_limits<size_t>::max();
2728
size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
2829
bool m_is_npu = false;
2930
// include reflection of tokens contained in the kv cache and amount of tokens, which are needed to trim from kv cache on the next step of chat

src/cpp/src/visual_language/pipeline.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
4242
std::shared_ptr<InputsEmbedder> m_inputs_embedder;
4343
// Component for applying sampling to lm outputs
4444
Sampler m_sampler;
45+
size_t m_max_prompt_len = std::numeric_limits<size_t>::max();
4546
size_t m_max_kv_cache_size = std::numeric_limits<size_t>::max();
4647
bool m_is_npu = false;
4748
public:
@@ -81,6 +82,7 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
8182
embedder_device = "CPU";
8283
utils::KVDesc kv_desc;
8384
std::tie(compiled_language_model, kv_desc) = utils::compile_decoder_for_npu(language_model, lm_properties, kv_pos);
85+
m_max_prompt_len = kv_desc.max_prompt_len;
8486
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
8587
} else {
8688
compiled_language_model = utils::singleton_core().compile_model(language_model, device, lm_properties);
@@ -183,6 +185,14 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
183185
ov::Tensor inputs_embeds = m_inputs_embedder->get_inputs_embeds(prompt, rgbs, perf_metrics);
184186
auto end_get_inputs_embeds = std::chrono::steady_clock::now();
185187

188+
if (m_is_npu) {
189+
// Prefill model in NPU is reshaped to NPUW_LLM_MAX_PROMPT_LEN x NPUW_LLM_MAX_PROMPT_LEN
190+
OPENVINO_ASSERT(inputs_embeds.get_shape().at(1) <= m_max_prompt_len,
191+
"VLM pipeline on NPU may only process input embeddings up to ", m_max_prompt_len,
192+
" tokens. ", inputs_embeds.get_shape().at(1), " is passed.\nSet the \"MAX_PROMPT_LEN\""
193+
" config option to increase the limit.");
194+
}
195+
186196
utils::KVCacheState& kv_cache_state = m_inputs_embedder->get_kv_cache_state();
187197
if (m_is_chat_conversation)
188198
utils::trim_kv_cache(m_language, kv_cache_state, std::nullopt);

0 commit comments

Comments (0)