@@ -204,6 +204,9 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
204204 }
205205
206206 m_generate_initialized = false ;
207+
208+ m_llm_profile.report_on_die = ov::npuw::profiling_enabled ();
209+ m_llm_profile.area = " LLM/execution" ;
207210}
208211
209212std::string ov::npuw::LLMInferRequest::init_pre_alloc_device () {
@@ -447,28 +450,38 @@ void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
447450
448451void ov::npuw::LLMInferRequest::prepare_for_new_conversation (int64_t prompt_length) {
449452 namespace uu = ov::npuw::util;
450- uu::fill_tensor_bytes (m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name)), 0u );
451- if (auto type_ids_port = m_prefill_in_ports.find (layer_names::token_type_ids);
452- type_ids_port != m_prefill_in_ports.end ()) {
453- uu::fill_tensor_bytes (m_prefill_request->get_tensor (type_ids_port->second ), 0u );
454- }
455- uu::fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask)), 0 );
456- uu::fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids)), 0 );
453+ namespace pp = ov::npuw::perf;
457454
458- // Clear all past_key_values tensors - use cached ports for efficiency
459- for (const auto & port : m_prefill_past_kv_ports) {
460- uu::fill_tensor_bytes (m_prefill_request->get_tensor (port), 0u );
461- }
455+ m_llm_profile[" 1/prefill:1a.fill_inputs" ] += pp::ms_to_run ([&]() {
456+ uu::fill_tensor_bytes (m_prefill_request->get_tensor (m_prefill_in_ports.at (m_input_ids_name)), 0u );
457+ if (auto type_ids_port = m_prefill_in_ports.find (layer_names::token_type_ids);
458+ type_ids_port != m_prefill_in_ports.end ()) {
459+ uu::fill_tensor_bytes (m_prefill_request->get_tensor (type_ids_port->second ), 0u );
460+ }
461+ uu::fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::attention_mask)), 0 );
462+ uu::fill_tensor<int64_t >(m_prefill_request->get_tensor (m_prefill_in_ports.at (layer_names::position_ids)), 0 );
463+ });
464+
465+ m_llm_profile[" 1/prefill:1b.clear_kvcache" ] += pp::ms_to_run ([&]() {
466+ // Clear all past_key_values tensors - use cached ports for efficiency
467+ for (const auto & port : m_prefill_past_kv_ports) {
468+ uu::fill_tensor_bytes (m_prefill_request->get_tensor (port), 0u );
469+ }
470+ });
462471
463472 m_npuw_llm_compiled_model->m_kvcache_desc .num_stored_tokens = 0u ;
464473
465- // Select the appropriate generate inference request variant based on prompt length
466- // The function internally calculates expected total tokens (prompt + min_response_len)
467- m_kvcache_request = select_generate_request (prompt_length);
468- m_kvcache_in_ports = m_generate_variant_in_ports.at (m_kvcache_request);
469- m_kvcache_out_ports = m_generate_variant_out_ports.at (m_kvcache_request);
474+ m_llm_profile[" 1/prefill:1c.select_gen_req" ] += pp::ms_to_run ([&]() {
475+ // Select the appropriate generate inference request variant based on prompt length
476+ // The function internally calculates expected total tokens (prompt + min_response_len)
477+ m_kvcache_request = select_generate_request (prompt_length);
478+ m_kvcache_in_ports = m_generate_variant_in_ports.at (m_kvcache_request);
479+ m_kvcache_out_ports = m_generate_variant_out_ports.at (m_kvcache_request);
480+ });
470481
471- apply_lora ();
482+ m_llm_profile[" 1/prefill:1d.apply_lora" ] += pp::ms_to_run ([&]() {
483+ apply_lora ();
484+ });
472485}
473486
474487void ov::npuw::LLMInferRequest::copy_kvcache () {
@@ -786,7 +799,9 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
786799 m_eagle3_ext.prepare_inputs (m_prefill_request, m_prefill_in_ports);
787800 }
788801
789- m_prefill_request->infer ();
802+ m_llm_profile[" 1/prefill:2a.npu_infer" ] += ov::npuw::perf::ms_to_run ([&]() {
803+ m_prefill_request->infer ();
804+ });
790805 auto & kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc ;
791806 kvcache_desc.num_stored_tokens += static_cast <uint32_t >(input_ids->get_shape ()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
792807
@@ -809,28 +824,37 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
809824 " \" NPUW_LLM_MAX_PROMPT_LEN\" or shorten the prompt." );
810825 }
811826
812- prepare_for_new_conversation (prompt_length) ;
827+ namespace pp = ov::npuw::perf ;
813828
814- const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill ;
815- if (use_chunk_prefill) {
816- OPENVINO_ASSERT (!token_type_ids,
817- " Chunking is not implemented for Gemma model family yet. "
818- " Please set NPUW_LLM_PREFILL_HINT to 'STATIC'" );
819- infer_chunked_prefill (input_ids, attention_mask, position_ids);
820- } else {
821- infer_whole_prefill (input_ids, attention_mask, position_ids, token_type_ids);
822- }
829+ m_llm_profile[" 1/prefill:1.prepare" ] += pp::ms_to_run ([&]() {
830+ prepare_for_new_conversation (prompt_length);
831+ });
823832
824- if (m_lm_head_request) {
825- LOG_DEBUG (" Calling inference for LM head model." );
826- m_lm_head_request->infer ();
827- m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
828- } else {
829- m_logits = m_prefill_request->get_tensor (m_prefill_out_ports.at (layer_names::logits));
830- }
833+ m_llm_profile[" 1/prefill:2.infer" ] += pp::ms_to_run ([&]() {
834+ const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill ;
835+ if (use_chunk_prefill) {
836+ OPENVINO_ASSERT (!token_type_ids,
837+ " Chunking is not implemented for Gemma model family yet. "
838+ " Please set NPUW_LLM_PREFILL_HINT to 'STATIC'" );
839+ infer_chunked_prefill (input_ids, attention_mask, position_ids);
840+ } else {
841+ infer_whole_prefill (input_ids, attention_mask, position_ids, token_type_ids);
842+ }
843+ });
844+
845+ m_llm_profile[" 1/prefill:3.lm_head" ] += pp::ms_to_run ([&]() {
846+ if (m_lm_head_request) {
847+ LOG_DEBUG (" Calling inference for LM head model." );
848+ m_lm_head_request->infer ();
849+ m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
850+ } else {
851+ m_logits = m_prefill_request->get_tensor (m_prefill_out_ports.at (layer_names::logits));
852+ }
853+ });
831854
832855 // Update last_hidden_state only for non-chunked prefill
833856 // For chunked prefill, accumulate_chunk_last_hidden_state() already set the tensor
857+ const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill ;
834858 if (m_eagle3_ext.is_eagle3_model () && !use_chunk_prefill) {
835859 m_eagle3_ext.update_last_hidden_state (m_prefill_request, m_prefill_out_ports);
836860 }
@@ -858,20 +882,30 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
858882 // prepare_for_new_conversation()
859883
860884 if (!m_generate_initialized) {
861- LOG_DEBUG (" Copy kv-cache from prefill to generate model." );
862- if (kvcache_desc.num_stored_tokens > 0 ) {
863- copy_kvcache ();
864- }
885+ namespace pp = ov::npuw::perf;
865886
866- LOG_DEBUG (" Prepare inputs." );
867- namespace uu = ov::npuw::util;
868- uu::fill_tensor_bytes (m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name)), 0u );
869- uu::fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::attention_mask)), 0 );
870- uu::fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::position_ids)), 0 );
871- if (token_type_ids) {
872- uu::fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::token_type_ids)),
887+ m_llm_profile[" N/generate:1.copy_kvcache" ] += pp::ms_to_run ([&]() {
888+ LOG_DEBUG (" Copy kv-cache from prefill to generate model." );
889+ if (kvcache_desc.num_stored_tokens > 0 ) {
890+ copy_kvcache ();
891+ }
892+ });
893+
894+ m_llm_profile[" N/generate:2.init_inputs" ] += pp::ms_to_run ([&]() {
895+ LOG_DEBUG (" Prepare inputs." );
896+ namespace uu = ov::npuw::util;
897+ uu::fill_tensor_bytes (m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name)), 0u );
898+ uu::fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::attention_mask)),
873899 0 );
874- }
900+ uu::fill_tensor<int64_t >(m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::position_ids)),
901+ 0 );
902+ if (token_type_ids) {
903+ uu::fill_tensor<int64_t >(
904+ m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::token_type_ids)),
905+ 0 );
906+ }
907+ });
908+
875909 m_generate_initialized = true ;
876910 }
877911
@@ -880,77 +914,86 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
880914 OPENVINO_THROW (" KV-Cache is full." );
881915 }
882916
883- // FIXME: these tensors should be shared between the parent & child models
884- // NB: input_ids can be either fp32(VLM) or i64(LLM)
885- auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name));
886- // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which
887- // input layers of generation model are resized to, then we need to put
888- // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors.
889- // Attention mask should rule out all left unusable space.
890- std::copy_n (
891- reinterpret_cast <uint8_t *>(input_ids->data ()),
892- input_ids->get_byte_size (),
893- reinterpret_cast <uint8_t *>(kv_input_ids->data ()) + kv_input_ids->get_byte_size () - input_ids->get_byte_size ());
917+ namespace pp = ov::npuw::perf;
894918
895- if (token_type_ids) {
896- auto kv_token_type_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::token_type_ids));
897- util::copy_to_right (token_type_ids, kv_token_type_ids);
898- }
919+ m_llm_profile[" N/generate:3.prepare" ] += pp::ms_to_run ([&]() {
920+ // FIXME: these tensors should be shared between the parent & child models
921+ // NB: input_ids can be either fp32(VLM) or i64(LLM)
922+ auto kv_input_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (m_input_ids_name));
923+ // NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which
924+ // input layers of generation model are resized to, then we need to put
925+ // `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors.
926+ // Attention mask should rule out all left unusable space.
927+ std::copy_n (reinterpret_cast <uint8_t *>(input_ids->data ()),
928+ input_ids->get_byte_size (),
929+ reinterpret_cast <uint8_t *>(kv_input_ids->data ()) + kv_input_ids->get_byte_size () -
930+ input_ids->get_byte_size ());
899931
900- // NOTE: Attention mask pattern for generate model requires the set of "1"
901- // units of length of the current prompt on the right (for present
902- // kv layers) and the set of "1" units of number of previously calculated
903- // tokens on the left (for past kv layers).
904- auto kv_attn_mask = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::attention_mask));
905- std::copy_n (attention_mask->data <int64_t >(),
906- attention_mask->get_size () - input_tokens_len,
907- kv_attn_mask->data <int64_t >());
908- if (input_tokens_len < kvcache_desc.max_generation_token_len ) {
909- std::fill_n (kv_attn_mask->data <int64_t >() + kv_attn_mask->get_size () - kvcache_desc.max_generation_token_len ,
910- kvcache_desc.max_generation_token_len - input_tokens_len,
911- 0 );
912- }
913- std::fill_n (kv_attn_mask->data <int64_t >() + kv_attn_mask->get_size () - input_tokens_len, input_tokens_len, 1 );
932+ if (token_type_ids) {
933+ auto kv_token_type_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::token_type_ids));
934+ util::copy_to_right (token_type_ids, kv_token_type_ids);
935+ }
914936
915- auto kv_pos_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::position_ids));
916- ov::npuw::util::pad_position_ids (kv_pos_ids, position_ids);
937+ // NOTE: Attention mask pattern for generate model requires the set of "1"
938+ // units of length of the current prompt on the right (for present
939+ // kv layers) and the set of "1" units of number of previously calculated
940+ // tokens on the left (for past kv layers).
941+ auto kv_attn_mask = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::attention_mask));
942+ std::copy_n (attention_mask->data <int64_t >(),
943+ attention_mask->get_size () - input_tokens_len,
944+ kv_attn_mask->data <int64_t >());
945+ if (input_tokens_len < kvcache_desc.max_generation_token_len ) {
946+ std::fill_n (
947+ kv_attn_mask->data <int64_t >() + kv_attn_mask->get_size () - kvcache_desc.max_generation_token_len ,
948+ kvcache_desc.max_generation_token_len - input_tokens_len,
949+ 0 );
950+ }
951+ std::fill_n (kv_attn_mask->data <int64_t >() + kv_attn_mask->get_size () - input_tokens_len, input_tokens_len, 1 );
917952
918- if (m_eagle3_ext.is_eagle3_model ()) {
919- m_eagle3_ext.prepare_inputs (m_kvcache_request, m_kvcache_in_ports);
920- }
953+ auto kv_pos_ids = m_kvcache_request->get_tensor (m_kvcache_in_ports.at (layer_names::position_ids));
954+ ov::npuw::util::pad_position_ids (kv_pos_ids, position_ids);
921955
922- m_kvcache_request->infer ();
956+ if (m_eagle3_ext.is_eagle3_model ()) {
957+ m_eagle3_ext.prepare_inputs (m_kvcache_request, m_kvcache_in_ports);
958+ }
959+ });
960+
961+ m_llm_profile[" N/generate:4.infer" ] += pp::ms_to_run ([&]() {
962+ m_kvcache_request->infer ();
963+ });
923964 kvcache_desc.num_stored_tokens += input_tokens_len;
924965
925- if (m_lm_head_request) {
926- LOG_DEBUG (" Calling inference for LM head model asynchronously" );
927- m_lm_head_request->start_async ();
928- if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
929- update_kvcache_for (m_kvcache_request,
930- m_kvcache_in_ports,
931- m_kvcache_out_ports,
932- input_tokens_len,
933- kvcache_desc.v_tensors_transposed_gen );
934- }
935- m_lm_head_request->wait ();
936- LOG_DEBUG (" Calling inference for LM head model -- done." );
966+ m_llm_profile[" N/generate:5.post" ] += pp::ms_to_run ([&]() {
967+ if (m_lm_head_request) {
968+ LOG_DEBUG (" Calling inference for LM head model asynchronously" );
969+ m_lm_head_request->start_async ();
970+ if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
971+ update_kvcache_for (m_kvcache_request,
972+ m_kvcache_in_ports,
973+ m_kvcache_out_ports,
974+ input_tokens_len,
975+ kvcache_desc.v_tensors_transposed_gen );
976+ }
977+ m_lm_head_request->wait ();
978+ LOG_DEBUG (" Calling inference for LM head model -- done." );
937979
938- m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
939- } else {
940- if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
941- update_kvcache_for (m_kvcache_request,
942- m_kvcache_in_ports,
943- m_kvcache_out_ports,
944- input_tokens_len,
945- kvcache_desc.v_tensors_transposed_gen );
946- }
980+ m_logits = m_lm_head_request->get_tensor (m_lm_head_logits_port);
981+ } else {
982+ if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size ) {
983+ update_kvcache_for (m_kvcache_request,
984+ m_kvcache_in_ports,
985+ m_kvcache_out_ports,
986+ input_tokens_len,
987+ kvcache_desc.v_tensors_transposed_gen );
988+ }
947989
948- m_logits = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (layer_names::logits));
949- }
990+ m_logits = m_kvcache_request->get_tensor (m_kvcache_out_ports.at (layer_names::logits));
991+ }
950992
951- if (m_eagle3_ext.is_eagle3_model ()) {
952- m_eagle3_ext.update_last_hidden_state (m_kvcache_request, m_kvcache_out_ports);
953- }
993+ if (m_eagle3_ext.is_eagle3_model ()) {
994+ m_eagle3_ext.update_last_hidden_state (m_kvcache_request, m_kvcache_out_ports);
995+ }
996+ });
954997
955998 LOG_DEBUG (" Done" );
956999}
0 commit comments