Skip to content

Commit 2c4bb15

Browse files
committed
Add profiling to infer_chunked_prefill and infer_whole_prefill
Cherry-pick llm_infer_request.cpp changes from a8f643d to add sub-step profiling for chunked/whole prefill and generate lm_head.
1 parent 901a2b8 commit 2c4bb15

File tree

1 file changed

+129
-111
lines changed

1 file changed

+129
-111
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 129 additions & 111 deletions
Original file line numberDiff line numberDiff line change
@@ -655,89 +655,97 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
655655

656656
while (remaining_prompts > 0) {
657657
// NB: input_ids can be either fp32(VLM) or i64(LLM)
658-
// The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible by
659-
// the chunk size
658+
// The last chunk may not be completely filled if the actual length of the prompts is not evenly divisible
659+
// by the chunk size
660660
auto current_prompts_len = std::min(remaining_prompts, chunk_prompt_len);
661661

662-
// Handle first chunk with prefix caching: populate attention mask for restored cache
663-
if (enable_prefix_caching && cache_context.restore_prefix_cache) {
664-
m_prefix_caching_helper->populate_attention_mask_for_restored_cache(attention_mask,
665-
attn_mask_in_tensor,
666-
kvcache_desc.num_stored_tokens);
667-
cache_context.restore_prefix_cache = false;
668-
}
662+
m_llm_profile["1/prefill:3a.prepare_chunk"].record([&]() {
669663

670-
// Populate the attention mask for the present chunk
671-
// For the already processed tokens, they will be added into the attention mask after inference call
672-
size_t last_chunk_offset = attn_mask_in_tensor->get_size() - chunk_prompt_len;
673-
if (current_prompts_len < chunk_prompt_len) {
674-
// We will populate current_prompts_len on the right side of attention mask for the processing tokens
675-
// If the current prompt length is smaller than the chunk prompt length,
676-
// clear the last chunk of the attention mask to ensure non-relevant tokens are masked
677-
ov::npuw::util::fill_tensor<int64_t>(attn_mask_in_tensor, 0, last_chunk_offset);
678-
}
664+
// Handle first chunk with prefix caching: populate attention mask for restored cache
665+
if (enable_prefix_caching && cache_context.restore_prefix_cache) {
666+
m_prefix_caching_helper->populate_attention_mask_for_restored_cache(attention_mask,
667+
attn_mask_in_tensor,
668+
kvcache_desc.num_stored_tokens);
669+
cache_context.restore_prefix_cache = false;
670+
}
679671

680-
std::copy_n(attention_mask->data<int64_t>() + kvcache_desc.num_stored_tokens,
681-
current_prompts_len,
682-
attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len);
672+
// Populate the attention mask for the present chunk
673+
// For the already processed tokens, they will be added into the attention mask after inference call
674+
size_t last_chunk_offset = attn_mask_in_tensor->get_size() - chunk_prompt_len;
675+
if (current_prompts_len < chunk_prompt_len) {
676+
// We will populate current_prompts_len on the right side of attention mask for the processing tokens
677+
// If the current prompt length is smaller than the chunk prompt length,
678+
// clear the last chunk of the attention mask to ensure non-relevant tokens are masked
679+
ov::npuw::util::fill_tensor<int64_t>(attn_mask_in_tensor, 0, last_chunk_offset);
680+
}
683681

684-
auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
685-
auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
686-
if (is_input_embeds) {
687-
current_prefill_bytes *= input_ids->get_shape().back();
688-
prefilled_bytes *= input_ids->get_shape().back();
689-
}
682+
std::copy_n(attention_mask->data<int64_t>() + kvcache_desc.num_stored_tokens,
683+
current_prompts_len,
684+
attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len);
690685

691-
ov::npuw::util::fill_tensor_bytes(input_ids_in_tensor, 0u);
692-
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()) + prefilled_bytes,
693-
current_prefill_bytes,
694-
reinterpret_cast<uint8_t*>(input_ids_in_tensor->data()) + input_ids_in_tensor->get_byte_size() -
695-
current_prefill_bytes);
696-
697-
// NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3, BATCH,
698-
// SEQ_LEN]
699-
// Copy position ids, considering the 3D position_ids
700-
auto last_dim = position_ids->get_shape().size() - 1;
701-
auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice(
702-
position_ids,
703-
static_cast<uint32_t>(last_dim),
704-
kvcache_desc.num_stored_tokens,
705-
kvcache_desc.num_stored_tokens + static_cast<uint32_t>(current_prompts_len));
706-
707-
auto pos_ids_slice =
708-
ov::npuw::util::make_tensor_slice(pos_ids_in_tensor,
709-
static_cast<uint32_t>(last_dim),
710-
static_cast<uint32_t>(chunk_prompt_len - current_prompts_len),
711-
static_cast<uint32_t>(chunk_prompt_len));
712-
713-
// Copy with proper stride handling
714-
actual_position_ids_slice->copy_to(pos_ids_slice._ptr);
686+
auto current_prefill_bytes = current_prompts_len * input_ids_elem_size;
687+
auto prefilled_bytes = kvcache_desc.num_stored_tokens * input_ids_elem_size;
688+
if (is_input_embeds) {
689+
current_prefill_bytes *= input_ids->get_shape().back();
690+
prefilled_bytes *= input_ids->get_shape().back();
691+
}
715692

716-
if (m_eagle3_ext.is_eagle3_model()) {
717-
m_eagle3_ext.prepare_inputs_for_chunk(m_prefill_request,
718-
m_prefill_in_ports,
719-
kvcache_desc.num_stored_tokens,
720-
static_cast<uint32_t>(current_prompts_len));
721-
}
693+
ov::npuw::util::fill_tensor_bytes(input_ids_in_tensor, 0u);
694+
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()) + prefilled_bytes,
695+
current_prefill_bytes,
696+
reinterpret_cast<uint8_t*>(input_ids_in_tensor->data()) + input_ids_in_tensor->get_byte_size() -
697+
current_prefill_bytes);
698+
699+
// NB: Regular LLM uses 2D position_ids [BATCH, SEQ_LEN], Qwen2.5 VL/Omni uses 3D position_ids [3,
700+
// BATCH, SEQ_LEN]
701+
// Copy position ids, considering the 3D position_ids
702+
auto last_dim = position_ids->get_shape().size() - 1;
703+
auto actual_position_ids_slice = ov::npuw::util::make_tensor_slice(
704+
position_ids,
705+
static_cast<uint32_t>(last_dim),
706+
kvcache_desc.num_stored_tokens,
707+
kvcache_desc.num_stored_tokens + static_cast<uint32_t>(current_prompts_len));
708+
709+
auto pos_ids_slice =
710+
ov::npuw::util::make_tensor_slice(pos_ids_in_tensor,
711+
static_cast<uint32_t>(last_dim),
712+
static_cast<uint32_t>(chunk_prompt_len - current_prompts_len),
713+
static_cast<uint32_t>(chunk_prompt_len));
714+
715+
// Copy with proper stride handling
716+
actual_position_ids_slice->copy_to(pos_ids_slice._ptr);
717+
718+
if (m_eagle3_ext.is_eagle3_model()) {
719+
m_eagle3_ext.prepare_inputs_for_chunk(m_prefill_request,
720+
m_prefill_in_ports,
721+
kvcache_desc.num_stored_tokens,
722+
static_cast<uint32_t>(current_prompts_len));
723+
}
722724

723-
// Update history size for dynamic context:
724-
// dynamic attention selector needs history size to determine the past KV shape and attention mask shape
725-
m_prefill_base_request->update_history_size(kvcache_desc.num_stored_tokens);
726-
m_prefill_request->infer();
725+
// Update history size for dynamic context:
726+
// dynamic attention selector needs history size to determine the past KV shape and attention mask shape
727+
m_prefill_base_request->update_history_size(kvcache_desc.num_stored_tokens);
728+
});
727729

728-
// Accumulate Eagle3 last_hidden_state from this chunk
729-
if (m_eagle3_ext.is_eagle3_model()) {
730-
m_eagle3_ext.accumulate_chunk_last_hidden_state(m_prefill_request,
731-
m_prefill_out_ports,
732-
static_cast<uint32_t>(current_prompts_len),
733-
static_cast<uint32_t>(input_prompt_len));
734-
}
730+
m_llm_profile["1/prefill:3b.infer"].record([&]() {
731+
m_prefill_request->infer();
732+
});
735733

736-
if (enable_prefix_caching) {
737-
m_prefix_caching_helper->store_computed_blocks(current_prompts_len,
738-
cache_context.prompt_hashes,
739-
cache_context.token_idx);
740-
}
734+
m_llm_profile["1/prefill:3c.post_chunk"].record([&]() {
735+
// Accumulate Eagle3 last_hidden_state from this chunk
736+
if (m_eagle3_ext.is_eagle3_model()) {
737+
m_eagle3_ext.accumulate_chunk_last_hidden_state(m_prefill_request,
738+
m_prefill_out_ports,
739+
static_cast<uint32_t>(current_prompts_len),
740+
static_cast<uint32_t>(input_prompt_len));
741+
}
742+
743+
if (enable_prefix_caching) {
744+
m_prefix_caching_helper->store_computed_blocks(current_prompts_len,
745+
cache_context.prompt_hashes,
746+
cache_context.token_idx);
747+
}
748+
});
741749

742750
remaining_prompts -= current_prompts_len;
743751
kvcache_desc.num_stored_tokens += static_cast<uint32_t>(current_prompts_len);
@@ -749,17 +757,19 @@ void ov::npuw::LLMInferRequest::infer_chunked_prefill(ov::SoPtr<ov::ITensor> inp
749757
break;
750758
}
751759

752-
// Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
753-
update_kvcache_for(m_prefill_request,
754-
m_prefill_in_ports,
755-
m_prefill_out_ports,
756-
static_cast<uint32_t>(current_prompts_len),
757-
kvcache_desc.v_tensors_transposed_pre);
758-
759-
// Update attention mask for the next iteration
760-
std::copy_n(attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len,
761-
current_prompts_len,
762-
attn_mask_in_tensor->data<int64_t>() + kvcache_desc.num_stored_tokens - current_prompts_len);
760+
m_llm_profile["1/prefill:3d.update_kvcache"].record([&]() {
761+
// Copy calculated key/values chunk from present k/v layer to past k/v layer for storage
762+
update_kvcache_for(m_prefill_request,
763+
m_prefill_in_ports,
764+
m_prefill_out_ports,
765+
static_cast<uint32_t>(current_prompts_len),
766+
kvcache_desc.v_tensors_transposed_pre);
767+
768+
// Update attention mask for the next iteration
769+
std::copy_n(attn_mask_in_tensor->data<int64_t>() + attn_mask_in_tensor->get_size() - current_prompts_len,
770+
current_prompts_len,
771+
attn_mask_in_tensor->data<int64_t>() + kvcache_desc.num_stored_tokens - current_prompts_len);
772+
});
763773
}
764774

765775
LOG_DEBUG("Done.");
@@ -776,34 +786,40 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
776786
LOG_DEBUG("Calling inference for prefill model in a single launch.");
777787
LOG_BLOCK();
778788

779-
// NB: padded_input can be either fp32(VLM) or i64(LLM)
780-
auto padded_input = m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name));
781-
std::copy_n(
782-
reinterpret_cast<uint8_t*>(input_ids->data()),
783-
input_ids->get_byte_size(),
784-
reinterpret_cast<uint8_t*>(padded_input->data()) + padded_input->get_byte_size() - input_ids->get_byte_size());
789+
m_llm_profile["1/prefill:3a.prepare"].record([&]() {
790+
// NB: padded_input can be either fp32(VLM) or i64(LLM)
791+
auto padded_input = m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name));
792+
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()),
793+
input_ids->get_byte_size(),
794+
reinterpret_cast<uint8_t*>(padded_input->data()) + padded_input->get_byte_size() -
795+
input_ids->get_byte_size());
785796

786-
auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask));
787-
std::copy_n(
788-
attention_mask->data<int64_t>(),
789-
attention_mask->get_size(),
790-
padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() - attention_mask->get_size());
797+
auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask));
798+
std::copy_n(
799+
attention_mask->data<int64_t>(),
800+
attention_mask->get_size(),
801+
padded_attention_mask->data<int64_t>() + padded_attention_mask->get_size() - attention_mask->get_size());
791802

792-
if (token_type_ids) {
793-
auto padded_token_type_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
803+
if (token_type_ids) {
804+
auto padded_token_type_ids =
805+
m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::token_type_ids));
794806

795-
std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()), token_type_ids->get_byte_size(), 0);
796-
util::copy_to_right(token_type_ids, padded_token_type_ids);
797-
}
807+
std::fill_n(reinterpret_cast<uint8_t*>(padded_token_type_ids->data()), token_type_ids->get_byte_size(), 0);
808+
util::copy_to_right(token_type_ids, padded_token_type_ids);
809+
}
798810

799-
auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
800-
ov::npuw::util::pad_position_ids(padded_position_ids, position_ids);
811+
auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids));
812+
ov::npuw::util::pad_position_ids(padded_position_ids, position_ids);
801813

802-
if (m_eagle3_ext.is_eagle3_model()) {
803-
m_eagle3_ext.prepare_inputs(m_prefill_request, m_prefill_in_ports);
804-
}
814+
if (m_eagle3_ext.is_eagle3_model()) {
815+
m_eagle3_ext.prepare_inputs(m_prefill_request, m_prefill_in_ports);
816+
}
817+
});
818+
819+
m_llm_profile["1/prefill:3b.infer"].record([&]() {
820+
m_prefill_request->infer();
821+
});
805822

806-
m_prefill_request->infer();
807823
auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
808824
kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_shape()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
809825

@@ -830,12 +846,12 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
830846
prepare_for_new_conversation(prompt_length);
831847
});
832848

849+
process_longrope(m_prefill_request, m_prefill_in_ports, position_ids);
850+
833851
m_llm_profile["1/prefill:2.apply_lora"].record([&]() {
834852
apply_lora();
835853
});
836854

837-
process_longrope(m_prefill_request, m_prefill_in_ports, position_ids);
838-
839855
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
840856
m_llm_profile["1/prefill:3.infer"].record([&]() {
841857
if (use_chunk_prefill) {
@@ -972,10 +988,12 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
972988
kvcache_desc.v_tensors_transposed_gen);
973989
}
974990
});
975-
m_lm_head_request->wait();
976-
LOG_DEBUG("Calling inference for LM head model -- done.");
991+
m_llm_profile["N/generate:4.lm_head"].record([&]() {
992+
m_lm_head_request->wait();
993+
LOG_DEBUG("Calling inference for LM head model -- done.");
977994

978-
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
995+
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
996+
});
979997
} else {
980998
m_llm_profile["N/generate:3.update_kvcache"].record([&]() {
981999
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {

0 commit comments

Comments
 (0)