@@ -59,10 +59,9 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
5959}
6060} // anonymous namespace
6161
62- ov::npuw::LLMInferRequest::LLMInferRequest (const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
63- const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
62+ ov::npuw::LLMInferRequest::LLMInferRequest (const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
6463 : ov::ISyncInferRequest(compiled_model),
65- m_kvcache_desc(kvcache_desc ) {
64+ m_npuw_llm_compiled_model(compiled_model ) {
6665 m_kvcache_request = compiled_model->m_kvcache_compiled ->create_infer_request ();
6766 m_prefill_request = compiled_model->m_prefill_compiled ->create_infer_request ();
6867
@@ -82,13 +81,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
8281}
8382
8483void ov::npuw::LLMInferRequest::prepare_for_new_conversation () {
85- // FIXME: for input_ids it must be padding from tokenizer that not available from here
86- // Get it from NPUW options
87- fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids" )), 0u );
88- fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" )), 0u );
89- fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" )), 0u );
90- fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" )), 0u );
91- m_kvcache_desc.num_stored_tokens = 0u ;
84+ fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" input_ids" )), 0 );
85+ fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" attention_mask" )), 0 );
86+ fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (" position_ids" )), 0 );
87+ fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" )), 0 );
88+ m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens = 0u ;
9289}
9390
9491void ov::npuw::LLMInferRequest::infer_prefill (ov::SoPtr<ov::ITensor> input_ids,
@@ -112,7 +109,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
112109 std::copy_n (position_ids->data <int64_t >(), position_ids->get_size (), padded_position_ids->data <int64_t >() + offset);
113110
114111 m_prefill_request->infer ();
115- m_kvcache_desc.num_stored_tokens += static_cast <uint32_t >(input_ids->get_size ());
112+ m_npuw_llm_compiled_model-> m_kvcache_desc .num_stored_tokens += static_cast <uint32_t >(input_ids->get_size ());
116113 m_need_copy_kvcache = true ;
117114
118115 m_logits = m_prefill_request->get_tensor (m_prefill_out_ports.at (" logits" ));
@@ -126,8 +123,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
126123 LOG_DEBUG (" Calling inference for generate model..." );
127124 LOG_BLOCK ();
128125
126+ auto & kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc ;
129127 // NB: KV-cache is full, further generation is impossible
130- if (m_kvcache_desc .num_stored_tokens == m_kvcache_desc .total_size ) {
128+ if (kvcache_desc .num_stored_tokens == kvcache_desc .total_size ) {
131129 OPENVINO_THROW (" KV-Cache is full." );
132130 }
133131
@@ -146,17 +144,16 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
146144 // taking into account kvcache dimension.
147145 fill_tensor<ov::float16>(kvcache_in_tensor, 0 );
148146
149- const auto & kv_dim = (output_name.find (" value" ) != std::string::npos && m_kvcache_desc .v_tensors_transposed )
147+ const auto & kv_dim = (output_name.find (" value" ) != std::string::npos && kvcache_desc .v_tensors_transposed )
150148 ? 3u
151- : m_kvcache_desc .dim ;
149+ : kvcache_desc .dim ;
152150
153- auto prefill_out_slice =
154- make_tensor_slice (prefill_out_tensor,
155- kv_dim,
156- m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens ,
157- m_kvcache_desc.max_prompt_size );
151+ auto prefill_out_slice = make_tensor_slice (prefill_out_tensor,
152+ kv_dim,
153+ kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens ,
154+ kvcache_desc.max_prompt_size );
158155
159- auto kvcache_in_slice = make_tensor_slice (kvcache_in_tensor, kv_dim, 0u , m_kvcache_desc .num_stored_tokens );
156+ auto kvcache_in_slice = make_tensor_slice (kvcache_in_tensor, kv_dim, 0u , kvcache_desc .num_stored_tokens );
160157
161158 if (kv_dim == 3u ) {
162159 copy_columns_by_row_chunks (prefill_out_slice, kvcache_in_slice);
@@ -168,7 +165,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
168165 LOG_DEBUG (" Prepare attention mask pattern." );
169166 auto * attention_mask_data =
170167 m_kvcache_request->get_tensor (m_kvcache_in_ports.at (" attention_mask" ))->data <int64_t >();
171- attention_mask_data[m_kvcache_desc .total_size - 1 ] = 1 ;
168+ attention_mask_data[kvcache_desc .total_size - 1 ] = 1 ;
172169
173170 m_need_copy_kvcache = false ;
174171 }
@@ -185,7 +182,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
185182
186183 m_kvcache_request->infer ();
187184 m_logits = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (" logits" ));
188- m_kvcache_desc .num_stored_tokens += 1 ;
185+ kvcache_desc .num_stored_tokens += 1 ;
189186
190187 LOG_DEBUG (" Write KV-cache for the new token to the correct input position for next iteration." );
191188 const std::size_t kStartOutputKVCacheLayers = 1u ;
@@ -194,13 +191,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
194191 const auto & output_name = kvcache_compiled->outputs ()[kStartOutputKVCacheLayers + i].get_any_name ();
195192 const auto & input_name = std::regex_replace (output_name, std::regex (" present" ), " past_key_values" );
196193 auto kvcache_in_tensor = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (input_name));
197- const auto & kv_dim = (output_name.find (" value" ) != std::string::npos && m_kvcache_desc .v_tensors_transposed )
194+ const auto & kv_dim = (output_name.find (" value" ) != std::string::npos && kvcache_desc .v_tensors_transposed )
198195 ? 3u
199- : m_kvcache_desc .dim ;
196+ : kvcache_desc .dim ;
200197 auto kvcache_in_slice = make_tensor_slice (kvcache_in_tensor,
201198 kv_dim,
202- m_kvcache_desc .num_stored_tokens - 1 ,
203- m_kvcache_desc .num_stored_tokens );
199+ kvcache_desc .num_stored_tokens - 1 ,
200+ kvcache_desc .num_stored_tokens );
204201 auto kvcache_out_tensor = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (output_name));
205202 kvcache_out_tensor->copy_to (kvcache_in_slice._ptr );
206203 }
0 commit comments