@@ -655,89 +655,97 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
655655
656656 while (remaining_prompts > 0 ) {
657657 // NB: input_ids can be either fp32(VLM) or i64(LLM)
658- // The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible by
659- // the chunk size
658+ // The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible
659+ // by the chunk size
660660 auto current_prompts_len = std::min (remaining_prompts, chunk_prompt_len);
661661
662- // Handle first chunk with prefix caching: populate attention mask for restored cache
663- if (enable_prefix_caching && cache_context.restore_prefix_cache ) {
664- m_prefix_caching_helper->populate_attention_mask_for_restored_cache (attention_mask,
665- attn_mask_in_tensor,
666- kvcache_desc.num_stored_tokens );
667- cache_context.restore_prefix_cache = false ;
668- }
662+ m_llm_profile[" 1/prefill:3a.prepare_chunk" ].record ([&]() {
669663
670- // Populate the attention mask for the present chunk
671- // For the already processed tokens, they will be added into the attention mask after inference call
672- size_t last_chunk_offset = attn_mask_in_tensor->get_size () - chunk_prompt_len;
673- if (current_prompts_len < chunk_prompt_len) {
674- // We will populate current_prompts_len on the right side of attention mask for the processing tokens
675- // If the current prompt length is smaller than the chunk prompt length,
676- // clear the last chunk of the attention mask to ensure non-relevant tokens are masked
677- ov::npuw::util::fill_tensor<int64_t >(attn_mask_in_tensor, 0 , last_chunk_offset);
678- }
664+ // Handle first chunk with prefix caching: populate attention mask for restored cache
665+ if (enable_prefix_caching && cache_context.restore_prefix_cache ) {
666+ m_prefix_caching_helper->populate_attention_mask_for_restored_cache (attention_mask,
667+ attn_mask_in_tensor,
668+ kvcache_desc.num_stored_tokens );
669+ cache_context.restore_prefix_cache = false ;
670+ }
679671
680- std::copy_n (attention_mask->data <int64_t >() + kvcache_desc.num_stored_tokens ,
681- current_prompts_len,
682- attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len);
672+ // Populate the attention mask for the present chunk
673+ // For the already processed tokens, they will be added into the attention mask after inference call
674+ size_t last_chunk_offset = attn_mask_in_tensor->get_size () - chunk_prompt_len;
675+ if (current_prompts_len < chunk_prompt_len) {
676+ // We will populate current_prompts_len on the right side of attention mask for the processing tokens
677+ // If the current prompt length is smaller than the chunk prompt length,
678+ // clear the last chunk of the attention mask to ensure non-relevant tokens are masked
679+ ov::npuw::util::fill_tensor<int64_t >(attn_mask_in_tensor, 0 , last_chunk_offset);
680+ }
683681
684- auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
685- auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
686- if (is_input_embeds) {
687- current_prefill_bytes *= input_ids->get_shape ().back ();
688- prefilled_bytes *= input_ids->get_shape ().back ();
689- }
682+ std::copy_n (attention_mask->data <int64_t >() + kvcache_desc.num_stored_tokens ,
683+ current_prompts_len,
684+ attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len);
690685
691- ov::npuw::util::fill_tensor_bytes (input_ids_in_tensor, 0u );
692- std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()) + prefilled_bytes,
693- current_prefill_bytes,
694- reinterpret_cast <uint8_t *>(input_ids_in_tensor->data ()) + input_ids_in_tensor->get_byte_size () -
695- current_prefill_bytes);
696-
697- // NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3, BATCH,
698- // SEQ_LEN]
699- // Copy postion ids with considering the 3D position_ids
700- auto last_dim = position_ids->get_shape ().size () - 1 ;
701- auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice (
702- position_ids,
703- static_cast <uint32_t >(last_dim),
704- kvcache_desc.num_stored_tokens ,
705- kvcache_desc.num_stored_tokens + static_cast <uint32_t >(current_prompts_len));
706-
707- auto pos_ids_slice =
708- ov::npuw::util::make_tensor_slice (pos_ids_in_tensor,
709- static_cast <uint32_t >(last_dim),
710- static_cast <uint32_t >(chunk_prompt_len - current_prompts_len),
711- static_cast <uint32_t >(chunk_prompt_len));
712-
713- // Copy with proper stride handling
714- actual_position_ids_slice->copy_to (pos_ids_slice._ptr );
686+ auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
687+ auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
688+ if (is_input_embeds) {
689+ current_prefill_bytes *= input_ids->get_shape ().back ();
690+ prefilled_bytes *= input_ids->get_shape ().back ();
691+ }
715692
716- if (m_eagle3_ext.is_eagle3_model ()) {
717- m_eagle3_ext.prepare_inputs_for_chunk (m_prefill_request,
718- m_prefill_in_ports,
719- kvcache_desc.num_stored_tokens ,
720- static_cast <uint32_t >(current_prompts_len));
721- }
693+ ov::npuw::util::fill_tensor_bytes (input_ids_in_tensor, 0u );
694+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()) + prefilled_bytes,
695+ current_prefill_bytes,
696+ reinterpret_cast <uint8_t *>(input_ids_in_tensor->data ()) + input_ids_in_tensor->get_byte_size () -
697+ current_prefill_bytes);
698+
699+ // NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3,
700+ // BATCH, SEQ_LEN]
701+ // Copy position ids, taking the 3D position_ids layout into account
702+ auto last_dim = position_ids->get_shape ().size () - 1 ;
703+ auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice (
704+ position_ids,
705+ static_cast <uint32_t >(last_dim),
706+ kvcache_desc.num_stored_tokens ,
707+ kvcache_desc.num_stored_tokens + static_cast <uint32_t >(current_prompts_len));
708+
709+ auto pos_ids_slice =
710+ ov::npuw::util::make_tensor_slice (pos_ids_in_tensor,
711+ static_cast <uint32_t >(last_dim),
712+ static_cast <uint32_t >(chunk_prompt_len - current_prompts_len),
713+ static_cast <uint32_t >(chunk_prompt_len));
714+
715+ // Copy with proper stride handling
716+ actual_position_ids_slice->copy_to (pos_ids_slice._ptr );
717+
718+ if (m_eagle3_ext.is_eagle3_model ()) {
719+ m_eagle3_ext.prepare_inputs_for_chunk (m_prefill_request,
720+ m_prefill_in_ports,
721+ kvcache_desc.num_stored_tokens ,
722+ static_cast <uint32_t >(current_prompts_len));
723+ }
722724
723- // Update history size for dynamic context:
724- // dynamic attention selector needs history size to determin the past KV shape and attention mask shape
725- m_prefill_base_request->update_history_size (kvcache_desc.num_stored_tokens );
726- m_prefill_request-> infer ( );
725+ // Update history size for dynamic context:
726+ // dynamic attention selector needs history size to determine the past KV shape and attention mask shape
727+ m_prefill_base_request->update_history_size (kvcache_desc.num_stored_tokens );
728+ } );
727729
728- // Accumulate Eagle3 last_hidden_state from this chunk
729- if (m_eagle3_ext.is_eagle3_model ()) {
730- m_eagle3_ext.accumulate_chunk_last_hidden_state (m_prefill_request,
731- m_prefill_out_ports,
732- static_cast <uint32_t >(current_prompts_len),
733- static_cast <uint32_t >(input_prompt_len));
734- }
730+ m_llm_profile[" 1/prefill:3b.infer" ].record ([&]() {
731+ m_prefill_request->infer ();
732+ });
735733
736- if (enable_prefix_caching) {
737- m_prefix_caching_helper->store_computed_blocks (current_prompts_len,
738- cache_context.prompt_hashes ,
739- cache_context.token_idx );
740- }
734+ m_llm_profile[" 1/prefill:3c.post_chunk" ].record ([&]() {
735+ // Accumulate Eagle3 last_hidden_state from this chunk
736+ if (m_eagle3_ext.is_eagle3_model ()) {
737+ m_eagle3_ext.accumulate_chunk_last_hidden_state (m_prefill_request,
738+ m_prefill_out_ports,
739+ static_cast <uint32_t >(current_prompts_len),
740+ static_cast <uint32_t >(input_prompt_len));
741+ }
742+
743+ if (enable_prefix_caching) {
744+ m_prefix_caching_helper->store_computed_blocks (current_prompts_len,
745+ cache_context.prompt_hashes ,
746+ cache_context.token_idx );
747+ }
748+ });
741749
742750 remaining_prompts -= current_prompts_len;
743751 kvcache_desc.num_stored_tokens += static_cast <uint32_t >(current_prompts_len);
@@ -749,17 +757,19 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
749757 break ;
750758 }
751759
752- // Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
753- update_kvcache_for (m_prefill_request,
754- m_prefill_in_ports,
755- m_prefill_out_ports,
756- static_cast <uint32_t >(current_prompts_len),
757- kvcache_desc.v_tensors_transposed_pre );
758-
759- // Update attention mask for the next iteration
760- std::copy_n (attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len,
761- current_prompts_len,
762- attn_mask_in_tensor->data <int64_t >() + kvcache_desc.num_stored_tokens - current_prompts_len);
760+ m_llm_profile[" 1/prefill:3d.update_kvcache" ].record ([&]() {
761+ // Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
762+ update_kvcache_for (m_prefill_request,
763+ m_prefill_in_ports,
764+ m_prefill_out_ports,
765+ static_cast <uint32_t >(current_prompts_len),
766+ kvcache_desc.v_tensors_transposed_pre );
767+
768+ // Update attention mask for the next iteration
769+ std::copy_n (attn_mask_in_tensor->data <int64_t >() + attn_mask_in_tensor->get_size () - current_prompts_len,
770+ current_prompts_len,
771+ attn_mask_in_tensor->data <int64_t >() + kvcache_desc.num_stored_tokens - current_prompts_len);
772+ });
763773 }
764774
765775 LOG_DEBUG (" Done." );
@@ -776,34 +786,40 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
776786 LOG_DEBUG (" Calling inference for prefill model in a single launch." );
777787 LOG_BLOCK ();
778788
779- // NB: padded_input can be either fp32(VLM) or i64(LLM)
780- auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
781- std::copy_n (
782- reinterpret_cast <uint8_t *>(input_ids->data ()),
783- input_ids->get_byte_size (),
784- reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () - input_ids->get_byte_size ());
789+ m_llm_profile[" 1/prefill:3a.prepare" ].record ([&]() {
790+ // NB: padded_input can be either fp32(VLM) or i64(LLM)
791+ auto padded_input = m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name));
792+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()),
793+ input_ids->get_byte_size (),
794+ reinterpret_cast <uint8_t *>(padded_input->data ()) + padded_input->get_byte_size () -
795+ input_ids->get_byte_size ());
785796
786- auto padded_attention_mask = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask));
787- std::copy_n (
788- attention_mask->data <int64_t >(),
789- attention_mask->get_size (),
790- padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () - attention_mask->get_size ());
797+ auto padded_attention_mask = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask));
798+ std::copy_n (
799+ attention_mask->data <int64_t >(),
800+ attention_mask->get_size (),
801+ padded_attention_mask->data <int64_t >() + padded_attention_mask->get_size () - attention_mask->get_size ());
791802
792- if (token_type_ids) {
793- auto padded_token_type_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::token_type_ids));
803+ if (token_type_ids) {
804+ auto padded_token_type_ids =
805+ m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::token_type_ids));
794806
795- std::fill_n (reinterpret_cast <uint8_t *>(padded_token_type_ids->data ()), token_type_ids->get_byte_size (), 0 );
796- util::copy_to_right (token_type_ids, padded_token_type_ids);
797- }
807+ std::fill_n (reinterpret_cast <uint8_t *>(padded_token_type_ids->data ()), token_type_ids->get_byte_size (), 0 );
808+ util::copy_to_right (token_type_ids, padded_token_type_ids);
809+ }
798810
799- auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids));
800- ov::npuw::util::pad_position_ids (padded_position_ids, position_ids);
811+ auto padded_position_ids = m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids));
812+ ov::npuw::util::pad_position_ids (padded_position_ids, position_ids);
801813
802- if (m_eagle3_ext.is_eagle3_model ()) {
803- m_eagle3_ext.prepare_inputs (m_prefill_request, m_prefill_in_ports);
804- }
814+ if (m_eagle3_ext.is_eagle3_model ()) {
815+ m_eagle3_ext.prepare_inputs (m_prefill_request, m_prefill_in_ports);
816+ }
817+ });
818+
819+ m_llm_profile[" 1/prefill:3b.infer" ].record ([&]() {
820+ m_prefill_request->infer ();
821+ });
805822
806- m_prefill_request->infer ();
807823 auto & kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc ;
808824 kvcache_desc.num_stored_tokens += static_cast <uint32_t >(input_ids->get_shape ()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
809825
@@ -830,12 +846,12 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
830846 prepare_for_new_conversation (prompt_length);
831847 });
832848
849+ process_longrope (m_prefill_request, m_prefill_in_ports, position_ids);
850+
833851 m_llm_profile[" 1/prefill:2.apply_lora" ].record ([&]() {
834852 apply_lora ();
835853 });
836854
837- process_longrope (m_prefill_request, m_prefill_in_ports, position_ids);
838-
839855 const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill ;
840856 m_llm_profile[" 1/prefill:3.infer" ].record ([&]() {
841857 if (use_chunk_prefill) {
@@ -972,10 +988,12 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
972988 kvcache_desc.v_tensors_transposed_gen );
973989 }
974990 });
975- m_lm_head_request->wait ();
976- LOG_DEBUG (" Calling inference for LM head model -- done." );
991+ m_llm_profile[" N/generate:4.lm_head" ].record ([&]() {
992+ m_lm_head_request->wait ();
993+ LOG_DEBUG (" Calling inference for LM head model -- done." );
977994
978- m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
995+ m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
996+ });
979997 } else {
980998 m_llm_profile[" N/generate:3.update_kvcache" ].record ([&]() {
981999 if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
0 commit comments