Skip to content

Commit 3370c5a

Browse files
committed
Add profiling to infer_chunked_prefill and infer_whole_prefill
Cherry-pick the llm_infer_request.cpp changes from a8f643d to add sub-step profiling to chunked and whole prefill, and to the LM-head step of the generate phase.
1 parent 67216ae commit 3370c5a

File tree

1 file changed

+157
-138
lines changed

1 file changed

+157
-138
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 157 additions & 138 deletions
Original file line numberDiff line numberDiff line change
@@ -654,90 +654,99 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
654654
}
655655

656656
while (remaining_prompts > 0) {
657-
// NB: input_ids can be either fp32(VLM) or i64(LLM)
658-
// The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible by
659-
// the chunk size
660-
auto current_prompts_len = std::min(remaining_prompts, chunk_prompt_len);
657+
m_llm_profile["1/prefill:3a.prepare_chunk"].record([&]() {
658+
// NB: input_ids can be either fp32(VLM) or i64(LLM)
659+
// The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible
660+
// by the chunk size
661+
auto current_prompts_len = std::min(remaining_prompts, chunk_prompt_len);
662+
663+
// Handle first chunk with prefix caching: populate attention mask for restored cache
664+
if (enable_prefix_caching && cache_context.restore_prefix_cache) {
665+
m_prefix_caching_helper->populate_attention_mask_for_restored_cache(attention_mask,
666+
attn_mask_in_tensor,
667+
kvcache_desc.num_stored_tokens);
668+
cache_context.restore_prefix_cache = false;
669+
}
661670

662-
// Handle first chunk with prefix caching: populate attention mask for restored cache
663-
if (enable_prefix_caching && cache_context.restore_prefix_cache) {
664-
m_prefix_caching_helper->populate_attention_mask_for_restored_cache(attention_mask,
665-
attn_mask_in_tensor,
666-
kvcache_desc.num_stored_tokens);
667-
cache_context.restore_prefix_cache = false;
668-
}
671+
// Populate the attention mask for the present chunk
672+
// For the already processed tokens, they will be added into the attention mask after inference call
673+
size_t last_chunk_offset = attn_mask_in_tensor->get_size() - chunk_prompt_len;
674+
if (current_prompts_len < chunk_prompt_len) {
675+
// We will populate current_prompts_len on the right side of attention mask for the processing tokens
676+
// If the current prompt length is smaller than the chunk prompt length,
677+
// clear the last chunk of the attention mask to ensure non-relevant tokens are masked
678+
ov::npuw::util::fill_tensor<int64_t>(attn_mask_in_tensor, 0, last_chunk_offset);
679+
}
669680

670-
// Populate the attention mask for the present chunk
671-
// For the already processed tokens, they will be added into the attention mask after inference call
672-
size_t last_chunk_offset = attn_mask_in_tensor->get_size() - chunk_prompt_len;
673-
if (current_prompts_len < chunk_prompt_len) {
674-
// We will populate current_prompts_len on the right side of attention mask for the processing tokens
675-
// If the current prompt length is smaller than the chunk prompt length,
676-
// clear the last chunk of the attention mask to ensure non-relevant tokens are masked
677-
ov::npuw::util::fill_tensor<int64_t>(attn_mask_in_tensor, 0, last_chunk_offset);
678-
}
681+
std::copy_n(attention_mask->data<int64_t>() + kvcache_desc.num_stored_tokens,
682+
current_prompts_len,
683+
attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len);
679684

680-
std::copy_n(attention_mask->data<int64_t>() + kvcache_desc.num_stored_tokens,
681-
current_prompts_len,
682-
attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len);
685+
auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
686+
auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
687+
if (is_input_embeds) {
688+
current_prefill_bytes *= input_ids->get_shape().back();
689+
prefilled_bytes *= input_ids->get_shape().back();
690+
}
683691

684-
auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
685-
auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
686-
if (is_input_embeds) {
687-
current_prefill_bytes *= input_ids->get_shape().back();
688-
prefilled_bytes *= input_ids->get_shape().back();
689-
}
692+
ov::npuw::util::fill_tensor_bytes(input_ids_in_tensor, 0u);
693+
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()) + prefilled_bytes,
694+
current_prefill_bytes,
695+
reinterpret_cast<uint8_t*>(input_ids_in_tensor->data()) +
696+
input_ids_in_tensor->get_byte_size() - current_prefill_bytes);
697+
698+
// NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3,
699+
// BATCH, SEQ_LEN]
700+
// Copy postion ids with considering the 3D position_ids
701+
auto last_dim = position_ids->get_shape().size() - 1;
702+
auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice(
703+
position_ids,
704+
static_cast<uint32_t>(last_dim),
705+
kvcache_desc.num_stored_tokens,
706+
kvcache_desc.num_stored_tokens + static_cast<uint32_t>(current_prompts_len));
707+
708+
auto pos_ids_slice =
709+
ov::npuw::util::make_tensor_slice(pos_ids_in_tensor,
710+
static_cast<uint32_t>(last_dim),
711+
static_cast<uint32_t>(chunk_prompt_len - current_prompts_len),
712+
static_cast<uint32_t>(chunk_prompt_len));
713+
714+
// Copy with proper stride handling
715+
actual_position_ids_slice->copy_to(pos_ids_slice._ptr);
716+
717+
if (m_eagle3_ext.is_eagle3_model()) {
718+
m_eagle3_ext.prepare_inputs_for_chunk(m_prefill_request,
719+
m_prefill_in_ports,
720+
kvcache_desc.num_stored_tokens,
721+
static_cast<uint32_t>(current_prompts_len));
722+
}
690723

691-
ov::npuw::util::fill_tensor_bytes(input_ids_in_tensor, 0u);
692-
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()) + prefilled_bytes,
693-
current_prefill_bytes,
694-
reinterpret_cast<uint8_t*>(input_ids_in_tensor->data()) + input_ids_in_tensor->get_byte_size() -
695-
current_prefill_bytes);
696-
697-
// NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3, BATCH,
698-
// SEQ_LEN]
699-
// Copy postion ids with considering the 3D position_ids
700-
auto last_dim = position_ids->get_shape().size() - 1;
701-
auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice(
702-
position_ids,
703-
static_cast<uint32_t>(last_dim),
704-
kvcache_desc.num_stored_tokens,
705-
kvcache_desc.num_stored_tokens + static_cast<uint32_t>(current_prompts_len));
706-
707-
auto pos_ids_slice =
708-
ov::npuw::util::make_tensor_slice(pos_ids_in_tensor,
709-
static_cast<uint32_t>(last_dim),
710-
static_cast<uint32_t>(chunk_prompt_len - current_prompts_len),
711-
static_cast<uint32_t>(chunk_prompt_len));
712-
713-
// Copy with proper stride handling
714-
actual_position_ids_slice->copy_to(pos_ids_slice._ptr);
724+
// Update history size for dynamic context:
725+
// dynamic attention selector needs history size to determin the past KV shape and attention mask shape
726+
m_prefill_base_request->update_history_size(kvcache_desc.num_stored_tokens);
727+
});
715728

716-
if (m_eagle3_ext.is_eagle3_model()) {
717-
m_eagle3_ext.prepare_inputs_for_chunk(m_prefill_request,
718-
m_prefill_in_ports,
719-
kvcache_desc.num_stored_tokens,
720-
static_cast<uint32_t>(current_prompts_len));
721-
}
729+
auto current_prompts_len = std::min(remaining_prompts, chunk_prompt_len);
722730

723-
// Update history size for dynamic context:
724-
// dynamic attention selector needs history size to determin the past KV shape and attention mask shape
725-
m_prefill_base_request->update_history_size(kvcache_desc.num_stored_tokens);
726-
m_prefill_request->infer();
731+
m_llm_profile["1/prefill:3b.infer"].record([&]() {
732+
m_prefill_request->infer();
733+
});
727734

728-
// Accumulate Eagle3 last_hidden_state from this chunk
729-
if (m_eagle3_ext.is_eagle3_model()) {
730-
m_eagle3_ext.accumulate_chunk_last_hidden_state(m_prefill_request,
731-
m_prefill_out_ports,
732-
static_cast<uint32_t>(current_prompts_len),
733-
static_cast<uint32_t>(input_prompt_len));
734-
}
735+
m_llm_profile["1/prefill:3c.post_chunk"].record([&]() {
736+
// Accumulate Eagle3 last_hidden_state from this chunk
737+
if (m_eagle3_ext.is_eagle3_model()) {
738+
m_eagle3_ext.accumulate_chunk_last_hidden_state(m_prefill_request,
739+
m_prefill_out_ports,
740+
static_cast<uint32_t>(current_prompts_len),
741+
static_cast<uint32_t>(input_prompt_len));
742+
}
735743

736-
if (enable_prefix_caching) {
737-
m_prefix_caching_helper->store_computed_blocks(current_prompts_len,
738-
cache_context.prompt_hashes,
739-
cache_context.token_idx);
740-
}
744+
if (enable_prefix_caching) {
745+
m_prefix_caching_helper->store_computed_blocks(current_prompts_len,
746+
cache_context.prompt_hashes,
747+
cache_context.token_idx);
748+
}
749+
});
741750

742751
remaining_prompts -= current_prompts_len;
743752
kvcache_desc.num_stored_tokens += static_cast<uint32_t>(current_prompts_len);
@@ -746,20 +755,26 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
746755
if (remaining_prompts <= 0) {
747756
LOG_DEBUG("All prompts have been prefilled in chunks");
748757
m_tokens_in_present_chunk = current_prompts_len;
749-
break;
758+
return;
750759
}
751760

752-
// Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
753-
update_kvcache_for(m_prefill_request,
754-
m_prefill_in_ports,
755-
m_prefill_out_ports,
756-
static_cast<uint32_t>(current_prompts_len),
757-
kvcache_desc.v_tensors_transposed_pre);
758-
759-
// Update attention mask for the next iteration
760-
std::copy_n(attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len,
761-
current_prompts_len,
762-
attn_mask_in_tensor->data<int64_t>() + kvcache_desc.num_stored_tokens - current_prompts_len);
761+
m_llm_profile["1/prefill:3d.update_kvcache"].record([&]() {
762+
// Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
763+
update_kvcache_for(m_prefill_request,
764+
m_prefill_in_ports,
765+
m_prefill_out_ports,
766+
static_cast<uint32_t>(current_prompts_len),
767+
kvcache_desc.v_tensors_transposed_pre);
768+
769+
// Update attention mask for the next iteration
770+
std::copy_n(attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len,
771+
current_prompts_len,
772+
attn_mask_in_tensor->data<int64_t>() + kvcache_desc.num_stored_tokens - current_prompts_len);
773+
});
774+
775+
if (remaining_prompts <= 0) {
776+
break;
777+
}
763778
}
764779

765780
LOG_DEBUG("Done.");
@@ -776,34 +791,43 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
776791
LOG_DEBUG("Calling inference for prefill model in a single launch.");
777792
LOG_BLOCK();
778793

779-
// NB: padded_input can be either fp32(VLM) or i64(LLM)
780-
auto padded_input = m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name));
781-
std::copy_n(
782-
reinterpret_cast<uint8_t*>(input_ids->data()),
783-
input_ids->get_byte_size(),
784-
reinterpret_cast<uint8_t*>(padded_input->data()) + padded_input->get_byte_size() - input_ids->get_byte_size());
794+
m_llm_profile["1/prefill:3a.prepare"].record([&]() {
795+
// NB: padded_input can be either fp32(VLM) or i64(LLM)
796+
auto padded_input = m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name));
797+
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()),
798+
input_ids->get_byte_size(),
799+
reinterpret_cast<uint8_t*>(padded_input->data()) + padded_input->get_byte_size() -
800+
input_ids->get_byte_size());
785801

786-
auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask));
787-
std::copy_n(
788-
attention_mask->data<int64_t>(),
789-
attention_mask->get_size(),
790-
padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() - attention_mask->get_size());
802+
auto padded_attention_mask =
803+
m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask));
804+
std::copy_n(attention_mask->data<int64_t>(),
805+
attention_mask->get_size(),
806+
padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() -
807+
attention_mask->get_size());
791808

792-
if (token_type_ids) {
793-
auto padded_token_type_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
809+
if (token_type_ids) {
810+
auto padded_token_type_ids =
811+
m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
794812

795-
std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()), token_type_ids->get_byte_size(), 0);
796-
util::copy_to_right(token_type_ids, padded_token_type_ids);
797-
}
813+
std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()),
814+
token_type_ids->get_byte_size(),
815+
0);
816+
util::copy_to_right(token_type_ids, padded_token_type_ids);
817+
}
798818

799-
auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
800-
ov::npuw::util::pad_position_ids(padded_position_ids, position_ids);
819+
auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
820+
ov::npuw::util::pad_position_ids(padded_position_ids, position_ids);
801821

802-
if (m_eagle3_ext.is_eagle3_model()) {
803-
m_eagle3_ext.prepare_inputs(m_prefill_request, m_prefill_in_ports);
804-
}
822+
if (m_eagle3_ext.is_eagle3_model()) {
823+
m_eagle3_ext.prepare_inputs(m_prefill_request, m_prefill_in_ports);
824+
}
825+
});
826+
827+
m_llm_profile["1/prefill:3b.infer"].record([&]() {
828+
m_prefill_request->infer();
829+
});
805830

806-
m_prefill_request->infer();
807831
auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
808832
kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_shape()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
809833

@@ -826,6 +850,8 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
826850
"\"NPUW_LLM_MAX_PROMPT_LEN\" or shorten the prompt.");
827851
}
828852

853+
process_longrope(m_prefill_request, m_prefill_in_ports, position_ids);
854+
829855
m_llm_profile["1/prefill:1.prepare_for_new_conversation"].record([&]() {
830856
prepare_for_new_conversation(prompt_length);
831857
});
@@ -834,8 +860,6 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
834860
apply_lora();
835861
});
836862

837-
process_longrope(m_prefill_request, m_prefill_in_ports, position_ids);
838-
839863
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
840864
m_llm_profile["1/prefill:3.infer"].record([&]() {
841865
if (use_chunk_prefill) {
@@ -963,36 +987,31 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
963987
if (m_lm_head_request) {
964988
LOG_DEBUG("Calling inference for LM head model asynchronously");
965989
m_lm_head_request->start_async();
966-
m_llm_profile["N/generate:3.update_kvcache"].record([&]() {
967-
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
968-
update_kvcache_for(m_kvcache_request,
969-
m_kvcache_in_ports,
970-
m_kvcache_out_ports,
971-
input_tokens_len,
972-
kvcache_desc.v_tensors_transposed_gen);
973-
}
974-
});
975-
m_lm_head_request->wait();
976-
LOG_DEBUG("Calling inference for LM head model -- done.");
990+
}
977991

978-
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
979-
} else {
980-
m_llm_profile["N/generate:3.update_kvcache"].record([&]() {
981-
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
982-
update_kvcache_for(m_kvcache_request,
983-
m_kvcache_in_ports,
984-
m_kvcache_out_ports,
985-
input_tokens_len,
986-
kvcache_desc.v_tensors_transposed_gen);
987-
}
988-
});
992+
m_llm_profile["N/generate:3.update_kvcache"].record([&]() {
993+
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
994+
update_kvcache_for(m_kvcache_request,
995+
m_kvcache_in_ports,
996+
m_kvcache_out_ports,
997+
input_tokens_len,
998+
kvcache_desc.v_tensors_transposed_gen);
999+
}
1000+
});
9891001

990-
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
991-
}
1002+
m_llm_profile["N/generate:4.lm_head"].record([&]() {
1003+
if (m_lm_head_request) {
1004+
m_lm_head_request->wait();
1005+
LOG_DEBUG("Calling inference for LM head model -- done.");
1006+
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
1007+
} else {
1008+
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
1009+
}
9921010

993-
if (m_eagle3_ext.is_eagle3_model()) {
994-
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
995-
}
1011+
if (m_eagle3_ext.is_eagle3_model()) {
1012+
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
1013+
}
1014+
});
9961015

9971016
LOG_DEBUG("Done");
9981017
}

0 commit comments

Comments
 (0)