@@ -654,90 +654,99 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
654654 }
655655
656656 while (remaining_prompts > 0 ) {
657- // NB: input_ids can be either fp32(VLM) or i64(LLM)
658- // The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible by
659- // the chunk size
660- auto current_prompts_len = std::min (remaining_prompts, chunk_prompt_len);
657+ m_llm_profile[" 1/prefill:3a.prepare_chunk" ].record ([&]() {
658+ // NB: input_ids can be either fp32(VLM) or i64(LLM)
659+ // The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible
660+ // by the chunk size
661+ auto current_prompts_len = std::min (remaining_prompts, chunk_prompt_len);
662+
663+ // Handle first chunk with prefix caching: populate attention mask for restored cache
664+ if (enable_prefix_caching && cache_context.restore_prefix_cache ) {
665+ m_prefix_caching_helper->populate_attention_mask_for_restored_cache (attention_mask,
666+ attn_mask_in_tensor,
667+ kvcache_desc.num_stored_tokens );
668+ cache_context.restore_prefix_cache = false ;
669+ }
661670
662- // Handle first chunk with prefix caching: populate attention mask for restored cache
663- if (enable_prefix_caching && cache_context.restore_prefix_cache ) {
664- m_prefix_caching_helper->populate_attention_mask_for_restored_cache (attention_mask,
665- attn_mask_in_tensor,
666- kvcache_desc.num_stored_tokens );
667- cache_context.restore_prefix_cache = false ;
668- }
671+ // Populate the attention mask for the present chunk
672+ // For the already processed tokens, they will be added into the attention mask after inference call
673+ size_t last_chunk_offset = attn_mask_in_tensor->get_size () - chunk_prompt_len;
674+ if (current_prompts_len < chunk_prompt_len) {
675+ // We will populate current_prompts_len on the right side of attention mask for the processing tokens
676+ // If the current prompt length is smaller than the chunk prompt length,
677+ // clear the last chunk of the attention mask to ensure non-relevant tokens are masked
678+ ov::npuw::util::fill_tensor<int64_t >(attn_mask_in_tensor, 0 , last_chunk_offset);
679+ }
669680
670- // Populate the attention mask for the present chunk
671- // For the already processed tokens, they will be added into the attention mask after inference call
672- size_t last_chunk_offset = attn_mask_in_tensor->get_size () - chunk_prompt_len;
673- if (current_prompts_len < chunk_prompt_len) {
674- // We will populate current_prompts_len on the right side of attention mask for the processing tokens
675- // If the current prompt length is smaller than the chunk prompt length,
676- // clear the last chunk of the attention mask to ensure non-relevant tokens are masked
677- ov::npuw::util::fill_tensor<int64_t >(attn_mask_in_tensor, 0 , last_chunk_offset);
678- }
681+ std::copy_n (attention_mask->data <int64_t >() + kvcache_desc.num_stored_tokens ,
682+ current_prompts_len,
683+ attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len);
679684
680- std::copy_n (attention_mask->data <int64_t >() + kvcache_desc.num_stored_tokens ,
681- current_prompts_len,
682- attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len);
685+ auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
686+ auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
687+ if (is_input_embeds) {
688+ current_prefill_bytes *= input_ids->get_shape ().back ();
689+ prefilled_bytes *= input_ids->get_shape ().back ();
690+ }
683691
684- auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
685- auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
686- if (is_input_embeds) {
687- current_prefill_bytes *= input_ids->get_shape ().back ();
688- prefilled_bytes *= input_ids->get_shape ().back ();
689- }
692+ ov::npuw::util::fill_tensor_bytes (input_ids_in_tensor, 0u );
693+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()) + prefilled_bytes,
694+ current_prefill_bytes,
695+ reinterpret_cast <uint8_t *>(input_ids_in_tensor->data ()) +
696+ input_ids_in_tensor->get_byte_size () - current_prefill_bytes);
697+
698+ // NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3,
699+ // BATCH, SEQ_LEN]
700+ // Copy position ids, taking the 3D position_ids layout into account
701+ auto last_dim = position_ids->get_shape ().size () - 1 ;
702+ auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice (
703+ position_ids,
704+ static_cast <uint32_t >(last_dim),
705+ kvcache_desc.num_stored_tokens ,
706+ kvcache_desc.num_stored_tokens + static_cast <uint32_t >(current_prompts_len));
707+
708+ auto pos_ids_slice =
709+ ov::npuw::util::make_tensor_slice (pos_ids_in_tensor,
710+ static_cast <uint32_t >(last_dim),
711+ static_cast <uint32_t >(chunk_prompt_len - current_prompts_len),
712+ static_cast <uint32_t >(chunk_prompt_len));
713+
714+ // Copy with proper stride handling
715+ actual_position_ids_slice->copy_to (pos_ids_slice._ptr );
716+
717+ if (m_eagle3_ext.is_eagle3_model ()) {
718+ m_eagle3_ext.prepare_inputs_for_chunk (m_prefill_request,
719+ m_prefill_in_ports,
720+ kvcache_desc.num_stored_tokens ,
721+ static_cast <uint32_t >(current_prompts_len));
722+ }
690723
691- ov::npuw::util::fill_tensor_bytes (input_ids_in_tensor, 0u );
692- std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()) + prefilled_bytes,
693- current_prefill_bytes,
694- reinterpret_cast <uint8_t *>(input_ids_in_tensor->data ()) + input_ids_in_tensor->get_byte_size () -
695- current_prefill_bytes);
696-
697- // NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3, BATCH,
698- // SEQ_LEN]
699- // Copy postion ids with considering the 3D position_ids
700- auto last_dim = position_ids->get_shape ().size () - 1 ;
701- auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice (
702- position_ids,
703- static_cast <uint32_t >(last_dim),
704- kvcache_desc.num_stored_tokens ,
705- kvcache_desc.num_stored_tokens + static_cast <uint32_t >(current_prompts_len));
706-
707- auto pos_ids_slice =
708- ov::npuw::util::make_tensor_slice (pos_ids_in_tensor,
709- static_cast <uint32_t >(last_dim),
710- static_cast <uint32_t >(chunk_prompt_len - current_prompts_len),
711- static_cast <uint32_t >(chunk_prompt_len));
712-
713- // Copy with proper stride handling
714- actual_position_ids_slice->copy_to (pos_ids_slice._ptr );
724+ // Update history size for dynamic context:
725+ // dynamic attention selector needs history size to determine the past KV shape and attention mask shape
726+ m_prefill_base_request->update_history_size (kvcache_desc.num_stored_tokens );
727+ });
715728
716- if (m_eagle3_ext.is_eagle3_model ()) {
717- m_eagle3_ext.prepare_inputs_for_chunk (m_prefill_request,
718- m_prefill_in_ports,
719- kvcache_desc.num_stored_tokens ,
720- static_cast <uint32_t >(current_prompts_len));
721- }
729+ auto current_prompts_len = std::min (remaining_prompts, chunk_prompt_len);
722730
723- // Update history size for dynamic context:
724- // dynamic attention selector needs history size to determin the past KV shape and attention mask shape
725- m_prefill_base_request->update_history_size (kvcache_desc.num_stored_tokens );
726- m_prefill_request->infer ();
731+ m_llm_profile[" 1/prefill:3b.infer" ].record ([&]() {
732+ m_prefill_request->infer ();
733+ });
727734
728- // Accumulate Eagle3 last_hidden_state from this chunk
729- if (m_eagle3_ext.is_eagle3_model ()) {
730- m_eagle3_ext.accumulate_chunk_last_hidden_state (m_prefill_request,
731- m_prefill_out_ports,
732- static_cast <uint32_t >(current_prompts_len),
733- static_cast <uint32_t >(input_prompt_len));
734- }
735+ m_llm_profile[" 1/prefill:3c.post_chunk" ].record ([&]() {
736+ // Accumulate Eagle3 last_hidden_state from this chunk
737+ if (m_eagle3_ext.is_eagle3_model ()) {
738+ m_eagle3_ext.accumulate_chunk_last_hidden_state (m_prefill_request,
739+ m_prefill_out_ports,
740+ static_cast <uint32_t >(current_prompts_len),
741+ static_cast <uint32_t >(input_prompt_len));
742+ }
735743
736- if (enable_prefix_caching) {
737- m_prefix_caching_helper->store_computed_blocks (current_prompts_len,
738- cache_context.prompt_hashes ,
739- cache_context.token_idx );
740- }
744+ if (enable_prefix_caching) {
745+ m_prefix_caching_helper->store_computed_blocks (current_prompts_len,
746+ cache_context.prompt_hashes ,
747+ cache_context.token_idx );
748+ }
749+ });
741750
742751 remaining_prompts -= current_prompts_len;
743752 kvcache_desc.num_stored_tokens += static_cast <uint32_t >(current_prompts_len);
@@ -746,20 +755,26 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
746755 if (remaining_prompts <= 0 ) {
747756 LOG_DEBUG (" All prompts have been prefilled in chunks" );
748757 m_tokens_in_present_chunk = current_prompts_len;
749- break ;
758+ return ;
750759 }
751760
752- // Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
753- update_kvcache_for (m_prefill_request,
754- m_prefill_in_ports,
755- m_prefill_out_ports,
756- static_cast <uint32_t >(current_prompts_len),
757- kvcache_desc.v_tensors_transposed_pre );
758-
759- // Update attention mask for the next iteration
760- std::copy_n (attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len,
761- current_prompts_len,
762- attn_mask_in_tensor->data <int64_t >() + kvcache_desc.num_stored_tokens - current_prompts_len);
761+ m_llm_profile[" 1/prefill:3d.update_kvcache" ].record ([&]() {
762+ // Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
763+ update_kvcache_for (m_prefill_request,
764+ m_prefill_in_ports,
765+ m_prefill_out_ports,
766+ static_cast <uint32_t >(current_prompts_len),
767+ kvcache_desc.v_tensors_transposed_pre );
768+
769+ // Update attention mask for the next iteration
770+ std::copy_n (attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len,
771+ current_prompts_len,
772+ attn_mask_in_tensor->data <int64_t >() + kvcache_desc.num_stored_tokens - current_prompts_len);
773+ });
774+
775+ if (remaining_prompts <= 0 ) {
776+ break ;
777+ }
763778 }
764779
765780 LOG_DEBUG (" Done." );
@@ -776,34 +791,43 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
776791 LOG_DEBUG (" Calling inference for prefill model in a single launch." );
777792 LOG_BLOCK ();
778793
779- // NB: padded_input can be either fp32(VLM) or i64(LLM)
780- auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
781- std::copy_n (
782- reinterpret_cast <uint8_t *>(input_ids->data ()),
783- input_ids->get_byte_size (),
784- reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () - input_ids->get_byte_size ());
794+ m_llm_profile[" 1/prefill:3a.prepare" ].record ([&]() {
795+ // NB: padded_input can be either fp32(VLM) or i64(LLM)
796+ auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
797+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()),
798+ input_ids->get_byte_size (),
799+ reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () -
800+ input_ids->get_byte_size ());
785801
786- auto padded_attention_mask = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask));
787- std::copy_n (
788- attention_mask->data <int64_t >(),
789- attention_mask->get_size (),
790- padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () - attention_mask->get_size ());
802+ auto padded_attention_mask =
803+ m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask));
804+ std::copy_n (attention_mask->data <int64_t >(),
805+ attention_mask->get_size (),
806+ padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () -
807+ attention_mask->get_size ());
791808
792- if (token_type_ids) {
793- auto padded_token_type_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::token_type_ids));
809+ if (token_type_ids) {
810+ auto padded_token_type_ids =
811+ m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::token_type_ids));
794812
795- std::fill_n (reinterpret_cast <uint8_t *>(padded_token_type_ids->data ()), token_type_ids->get_byte_size (), 0 );
796- util::copy_to_right (token_type_ids, padded_token_type_ids);
797- }
813+ std::fill_n (reinterpret_cast <uint8_t *>(padded_token_type_ids->data ()),
814+ token_type_ids->get_byte_size (),
815+ 0 );
816+ util::copy_to_right (token_type_ids, padded_token_type_ids);
817+ }
798818
799- auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids));
800- ov::npuw::util::pad_position_ids (padded_position_ids, position_ids);
819+ auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids));
820+ ov::npuw::util::pad_position_ids (padded_position_ids, position_ids);
801821
802- if (m_eagle3_ext.is_eagle3_model ()) {
803- m_eagle3_ext.prepare_inputs (m_prefill_request, m_prefill_in_ports);
804- }
822+ if (m_eagle3_ext.is_eagle3_model ()) {
823+ m_eagle3_ext.prepare_inputs (m_prefill_request, m_prefill_in_ports);
824+ }
825+ });
826+
827+ m_llm_profile[" 1/prefill:3b.infer" ].record ([&]() {
828+ m_prefill_request->infer ();
829+ });
805830
806- m_prefill_request->infer ();
807831 auto & kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc ;
808832 kvcache_desc.num_stored_tokens += static_cast <uint32_t >(input_ids->get_shape ()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
809833
@@ -826,6 +850,8 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
826850 " \" NPUW_LLM_MAX_PROMPT_LEN\" or shorten the prompt." );
827851 }
828852
853+ process_longrope (m_prefill_request, m_prefill_in_ports, position_ids);
854+
829855 m_llm_profile[" 1/prefill:1.prepare_for_new_conversation" ].record ([&]() {
830856 prepare_for_new_conversation (prompt_length);
831857 });
@@ -834,8 +860,6 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
834860 apply_lora ();
835861 });
836862
837- process_longrope (m_prefill_request, m_prefill_in_ports, position_ids);
838-
839863 const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill ;
840864 m_llm_profile[" 1/prefill:3.infer" ].record ([&]() {
841865 if (use_chunk_prefill) {
@@ -963,36 +987,31 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
963987 if (m_lm_head_request) {
964988 LOG_DEBUG (" Calling inference for LM head model asynchronously" );
965989 m_lm_head_request->start_async ();
966- m_llm_profile[" N/generate:3.update_kvcache" ].record ([&]() {
967- if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
968- update_kvcache_for (m_kvcache_request,
969- m_kvcache_in_ports,
970- m_kvcache_out_ports,
971- input_tokens_len,
972- kvcache_desc.v_tensors_transposed_gen );
973- }
974- });
975- m_lm_head_request->wait ();
976- LOG_DEBUG (" Calling inference for LM head model -- done." );
990+ }
977991
978- m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
979- } else {
980- m_llm_profile[" N/generate:3.update_kvcache" ].record ([&]() {
981- if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
982- update_kvcache_for (m_kvcache_request,
983- m_kvcache_in_ports,
984- m_kvcache_out_ports,
985- input_tokens_len,
986- kvcache_desc.v_tensors_transposed_gen );
987- }
988- });
992+ m_llm_profile[" N/generate:3.update_kvcache" ].record ([&]() {
993+ if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
994+ update_kvcache_for (m_kvcache_request,
995+ m_kvcache_in_ports,
996+ m_kvcache_out_ports,
997+ input_tokens_len,
998+ kvcache_desc.v_tensors_transposed_gen );
999+ }
1000+ });
9891001
990- m_logits = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (layer_names::logits));
991- }
1002+ m_llm_profile[" N/generate:4.lm_head" ].record ([&]() {
1003+ if (m_lm_head_request) {
1004+ m_lm_head_request->wait ();
1005+ LOG_DEBUG (" Calling inference for LM head model -- done." );
1006+ m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
1007+ } else {
1008+ m_logits = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (layer_names::logits));
1009+ }
9921010
993- if (m_eagle3_ext.is_eagle3_model ()) {
994- m_eagle3_ext.update_last_hidden_state (m_kvcache_request, m_kvcache_out_ports);
995- }
1011+ if (m_eagle3_ext.is_eagle3_model ()) {
1012+ m_eagle3_ext.update_last_hidden_state (m_kvcache_request, m_kvcache_out_ports);
1013+ }
1014+ });
9961015
9971016 LOG_DEBUG (" Done" );
9981017}
0 commit comments