Skip to content

Commit f9de002

Browse files
dylanneve1 and claude
committed
[NPUW] Add LLM-level profiling for 1st token generation analysis
Add fine-grained timing instrumentation to LLMInferRequest to help isolate the ~45ms low-power activity observed at the beginning of 1st token generation in NPU power measurements (EISW-202385). Profiled phases: input tensor fill, KV-cache clearing, generate request selection, LoRA application, prefill inference, LM head, and per-token generate preparation/inference/post-processing. Activated via OPENVINO_NPUW_PROF=1, reports on request destruction. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3b7c385 commit f9de002

File tree

2 files changed

+156
-108
lines changed

2 files changed

+156
-108
lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 151 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,9 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
204204
}
205205

206206
m_generate_initialized = false;
207+
208+
m_llm_profile.report_on_die = ov::npuw::profiling_enabled();
209+
m_llm_profile.area = "LLM/execution";
207210
}
208211

209212
std::string ov::npuw::LLMInferRequest::init_pre_alloc_device() {
@@ -447,28 +450,38 @@ void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
447450

448451
void ov::npuw::LLMInferRequest::prepare_for_new_conversation(int64_t prompt_length) {
449452
namespace uu = ov::npuw::util;
450-
uu::fill_tensor_bytes(m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name)), 0u);
451-
if (auto type_ids_port = m_prefill_in_ports.find(layer_names::token_type_ids);
452-
type_ids_port != m_prefill_in_ports.end()) {
453-
uu::fill_tensor_bytes(m_prefill_request->get_tensor(type_ids_port->second), 0u);
454-
}
455-
uu::fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)), 0);
456-
uu::fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)), 0);
453+
namespace pp = ov::npuw::perf;
457454

458-
// Clear all past_key_values tensors - use cached ports for efficiency
459-
for (const auto& port : m_prefill_past_kv_ports) {
460-
uu::fill_tensor_bytes(m_prefill_request->get_tensor(port), 0u);
461-
}
455+
m_llm_profile["1/prefill:1a.fill_inputs"] += pp::ms_to_run([&]() {
456+
uu::fill_tensor_bytes(m_prefill_request->get_tensor(m_prefill_in_ports.at(m_input_ids_name)), 0u);
457+
if (auto type_ids_port = m_prefill_in_ports.find(layer_names::token_type_ids);
458+
type_ids_port != m_prefill_in_ports.end()) {
459+
uu::fill_tensor_bytes(m_prefill_request->get_tensor(type_ids_port->second), 0u);
460+
}
461+
uu::fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::attention_mask)), 0);
462+
uu::fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at(layer_names::position_ids)), 0);
463+
});
464+
465+
m_llm_profile["1/prefill:1b.clear_kvcache"] += pp::ms_to_run([&]() {
466+
// Clear all past_key_values tensors - use cached ports for efficiency
467+
for (const auto& port : m_prefill_past_kv_ports) {
468+
uu::fill_tensor_bytes(m_prefill_request->get_tensor(port), 0u);
469+
}
470+
});
462471

463472
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
464473

465-
// Select the appropriate generate inference request variant based on prompt length
466-
// The function internally calculates expected total tokens (prompt + min_response_len)
467-
m_kvcache_request = select_generate_request(prompt_length);
468-
m_kvcache_in_ports = m_generate_variant_in_ports.at(m_kvcache_request);
469-
m_kvcache_out_ports = m_generate_variant_out_ports.at(m_kvcache_request);
474+
m_llm_profile["1/prefill:1c.select_gen_req"] += pp::ms_to_run([&]() {
475+
// Select the appropriate generate inference request variant based on prompt length
476+
// The function internally calculates expected total tokens (prompt + min_response_len)
477+
m_kvcache_request = select_generate_request(prompt_length);
478+
m_kvcache_in_ports = m_generate_variant_in_ports.at(m_kvcache_request);
479+
m_kvcache_out_ports = m_generate_variant_out_ports.at(m_kvcache_request);
480+
});
470481

471-
apply_lora();
482+
m_llm_profile["1/prefill:1d.apply_lora"] += pp::ms_to_run([&]() {
483+
apply_lora();
484+
});
472485
}
473486

474487
void ov::npuw::LLMInferRequest::copy_kvcache() {
@@ -786,7 +799,9 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
786799
m_eagle3_ext.prepare_inputs(m_prefill_request, m_prefill_in_ports);
787800
}
788801

789-
m_prefill_request->infer();
802+
m_llm_profile["1/prefill:2a.npu_infer"] += ov::npuw::perf::ms_to_run([&]() {
803+
m_prefill_request->infer();
804+
});
790805
auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
791806
kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_shape()[layer_ids::INPUT_IDS_SEQ_LEN_DIM]);
792807

@@ -809,28 +824,37 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
809824
"\"NPUW_LLM_MAX_PROMPT_LEN\" or shorten the prompt.");
810825
}
811826

812-
prepare_for_new_conversation(prompt_length);
827+
namespace pp = ov::npuw::perf;
813828

814-
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
815-
if (use_chunk_prefill) {
816-
OPENVINO_ASSERT(!token_type_ids,
817-
"Chunking is not implemented for Gemma model family yet. "
818-
"Please set NPUW_LLM_PREFILL_HINT to 'STATIC'");
819-
infer_chunked_prefill(input_ids, attention_mask, position_ids);
820-
} else {
821-
infer_whole_prefill(input_ids, attention_mask, position_ids, token_type_ids);
822-
}
829+
m_llm_profile["1/prefill:1.prepare"] += pp::ms_to_run([&]() {
830+
prepare_for_new_conversation(prompt_length);
831+
});
823832

824-
if (m_lm_head_request) {
825-
LOG_DEBUG("Calling inference for LM head model.");
826-
m_lm_head_request->infer();
827-
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
828-
} else {
829-
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
830-
}
833+
m_llm_profile["1/prefill:2.infer"] += pp::ms_to_run([&]() {
834+
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
835+
if (use_chunk_prefill) {
836+
OPENVINO_ASSERT(!token_type_ids,
837+
"Chunking is not implemented for Gemma model family yet. "
838+
"Please set NPUW_LLM_PREFILL_HINT to 'STATIC'");
839+
infer_chunked_prefill(input_ids, attention_mask, position_ids);
840+
} else {
841+
infer_whole_prefill(input_ids, attention_mask, position_ids, token_type_ids);
842+
}
843+
});
844+
845+
m_llm_profile["1/prefill:3.lm_head"] += pp::ms_to_run([&]() {
846+
if (m_lm_head_request) {
847+
LOG_DEBUG("Calling inference for LM head model.");
848+
m_lm_head_request->infer();
849+
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
850+
} else {
851+
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
852+
}
853+
});
831854

832855
// Update last_hidden_state only for non-chunked prefill
833856
// For chunked prefill, accumulate_chunk_last_hidden_state() already set the tensor
857+
const bool use_chunk_prefill = m_npuw_llm_compiled_model->m_use_chunk_prefill;
834858
if (m_eagle3_ext.is_eagle3_model() && !use_chunk_prefill) {
835859
m_eagle3_ext.update_last_hidden_state(m_prefill_request, m_prefill_out_ports);
836860
}
@@ -858,20 +882,30 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
858882
// prepare_for_new_conversation()
859883

860884
if (!m_generate_initialized) {
861-
LOG_DEBUG("Copy kv-cache from prefill to generate model.");
862-
if (kvcache_desc.num_stored_tokens > 0) {
863-
copy_kvcache();
864-
}
885+
namespace pp = ov::npuw::perf;
865886

866-
LOG_DEBUG("Prepare inputs.");
867-
namespace uu = ov::npuw::util;
868-
uu::fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u);
869-
uu::fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)), 0);
870-
uu::fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)), 0);
871-
if (token_type_ids) {
872-
uu::fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids)),
887+
m_llm_profile["N/generate:1.copy_kvcache"] += pp::ms_to_run([&]() {
888+
LOG_DEBUG("Copy kv-cache from prefill to generate model.");
889+
if (kvcache_desc.num_stored_tokens > 0) {
890+
copy_kvcache();
891+
}
892+
});
893+
894+
m_llm_profile["N/generate:2.init_inputs"] += pp::ms_to_run([&]() {
895+
LOG_DEBUG("Prepare inputs.");
896+
namespace uu = ov::npuw::util;
897+
uu::fill_tensor_bytes(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name)), 0u);
898+
uu::fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask)),
873899
0);
874-
}
900+
uu::fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids)),
901+
0);
902+
if (token_type_ids) {
903+
uu::fill_tensor<int64_t>(
904+
m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids)),
905+
0);
906+
}
907+
});
908+
875909
m_generate_initialized = true;
876910
}
877911

@@ -880,77 +914,86 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
880914
OPENVINO_THROW("KV-Cache is full.");
881915
}
882916

883-
// FIXME: these tensors should be shared between the parent & child models
884-
// NB: input_ids can be either fp32(VLM) or i64(LLM)
885-
auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name));
886-
// NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which
887-
// input layers of generation model are resized to, then we need to put
888-
// `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors.
889-
// Attention mask should rule out all left unusable space.
890-
std::copy_n(
891-
reinterpret_cast<uint8_t*>(input_ids->data()),
892-
input_ids->get_byte_size(),
893-
reinterpret_cast<uint8_t*>(kv_input_ids->data()) + kv_input_ids->get_byte_size() - input_ids->get_byte_size());
917+
namespace pp = ov::npuw::perf;
894918

895-
if (token_type_ids) {
896-
auto kv_token_type_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids));
897-
util::copy_to_right(token_type_ids, kv_token_type_ids);
898-
}
919+
m_llm_profile["N/generate:3.prepare"] += pp::ms_to_run([&]() {
920+
// FIXME: these tensors should be shared between the parent & child models
921+
// NB: input_ids can be either fp32(VLM) or i64(LLM)
922+
auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(m_input_ids_name));
923+
// NOTE: As `input_tokens_len` can be less than the value of `max_generation_token_len`, which
924+
// input layers of generation model are resized to, then we need to put
925+
// `input_tokens_len` prompt to the right of `max_generation_token_len`-sized tensors.
926+
// Attention mask should rule out all left unusable space.
927+
std::copy_n(reinterpret_cast<uint8_t*>(input_ids->data()),
928+
input_ids->get_byte_size(),
929+
reinterpret_cast<uint8_t*>(kv_input_ids->data()) + kv_input_ids->get_byte_size() -
930+
input_ids->get_byte_size());
899931

900-
// NOTE: Attention mask pattern for generate model requires the set of "1"
901-
// units of length of the current prompt on the right (for present
902-
// kv layers) and the set of "1" units of number of previously calculated
903-
// tokens on the left (for past kv layers).
904-
auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask));
905-
std::copy_n(attention_mask->data<int64_t>(),
906-
attention_mask->get_size() - input_tokens_len,
907-
kv_attn_mask->data<int64_t>());
908-
if (input_tokens_len < kvcache_desc.max_generation_token_len) {
909-
std::fill_n(kv_attn_mask->data<int64_t>() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len,
910-
kvcache_desc.max_generation_token_len - input_tokens_len,
911-
0);
912-
}
913-
std::fill_n(kv_attn_mask->data<int64_t>() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1);
932+
if (token_type_ids) {
933+
auto kv_token_type_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::token_type_ids));
934+
util::copy_to_right(token_type_ids, kv_token_type_ids);
935+
}
914936

915-
auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids));
916-
ov::npuw::util::pad_position_ids(kv_pos_ids, position_ids);
937+
// NOTE: Attention mask pattern for generate model requires the set of "1"
938+
// units of length of the current prompt on the right (for present
939+
// kv layers) and the set of "1" units of number of previously calculated
940+
// tokens on the left (for past kv layers).
941+
auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::attention_mask));
942+
std::copy_n(attention_mask->data<int64_t>(),
943+
attention_mask->get_size() - input_tokens_len,
944+
kv_attn_mask->data<int64_t>());
945+
if (input_tokens_len < kvcache_desc.max_generation_token_len) {
946+
std::fill_n(
947+
kv_attn_mask->data<int64_t>() + kv_attn_mask->get_size() - kvcache_desc.max_generation_token_len,
948+
kvcache_desc.max_generation_token_len - input_tokens_len,
949+
0);
950+
}
951+
std::fill_n(kv_attn_mask->data<int64_t>() + kv_attn_mask->get_size() - input_tokens_len, input_tokens_len, 1);
917952

918-
if (m_eagle3_ext.is_eagle3_model()) {
919-
m_eagle3_ext.prepare_inputs(m_kvcache_request, m_kvcache_in_ports);
920-
}
953+
auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(layer_names::position_ids));
954+
ov::npuw::util::pad_position_ids(kv_pos_ids, position_ids);
921955

922-
m_kvcache_request->infer();
956+
if (m_eagle3_ext.is_eagle3_model()) {
957+
m_eagle3_ext.prepare_inputs(m_kvcache_request, m_kvcache_in_ports);
958+
}
959+
});
960+
961+
m_llm_profile["N/generate:4.infer"] += pp::ms_to_run([&]() {
962+
m_kvcache_request->infer();
963+
});
923964
kvcache_desc.num_stored_tokens += input_tokens_len;
924965

925-
if (m_lm_head_request) {
926-
LOG_DEBUG("Calling inference for LM head model asynchronously");
927-
m_lm_head_request->start_async();
928-
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
929-
update_kvcache_for(m_kvcache_request,
930-
m_kvcache_in_ports,
931-
m_kvcache_out_ports,
932-
input_tokens_len,
933-
kvcache_desc.v_tensors_transposed_gen);
934-
}
935-
m_lm_head_request->wait();
936-
LOG_DEBUG("Calling inference for LM head model -- done.");
966+
m_llm_profile["N/generate:5.post"] += pp::ms_to_run([&]() {
967+
if (m_lm_head_request) {
968+
LOG_DEBUG("Calling inference for LM head model asynchronously");
969+
m_lm_head_request->start_async();
970+
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
971+
update_kvcache_for(m_kvcache_request,
972+
m_kvcache_in_ports,
973+
m_kvcache_out_ports,
974+
input_tokens_len,
975+
kvcache_desc.v_tensors_transposed_gen);
976+
}
977+
m_lm_head_request->wait();
978+
LOG_DEBUG("Calling inference for LM head model -- done.");
937979

938-
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
939-
} else {
940-
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
941-
update_kvcache_for(m_kvcache_request,
942-
m_kvcache_in_ports,
943-
m_kvcache_out_ports,
944-
input_tokens_len,
945-
kvcache_desc.v_tensors_transposed_gen);
946-
}
980+
m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
981+
} else {
982+
if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
983+
update_kvcache_for(m_kvcache_request,
984+
m_kvcache_in_ports,
985+
m_kvcache_out_ports,
986+
input_tokens_len,
987+
kvcache_desc.v_tensors_transposed_gen);
988+
}
947989

948-
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
949-
}
990+
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
991+
}
950992

951-
if (m_eagle3_ext.is_eagle3_model()) {
952-
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
953-
}
993+
if (m_eagle3_ext.is_eagle3_model()) {
994+
m_eagle3_ext.update_last_hidden_state(m_kvcache_request, m_kvcache_out_ports);
995+
}
996+
});
954997

955998
LOG_DEBUG("Done");
956999
}

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "llm_lora_states.hpp"
1414
#include "llm_prefix_caching.hpp"
1515
#include "openvino/core/descriptor/output.hpp"
16+
#include "perf.hpp"
1617

1718
namespace ov {
1819
namespace npuw {
@@ -127,6 +128,10 @@ class LLMInferRequest : public ov::npuw::LLMInferBaseRequest {
127128
// Support prefix caching
128129
std::unique_ptr<PrefixCachingHelper> m_prefix_caching_helper;
129130

131+
// LLM-level profiling for 1st token generation analysis
132+
using MS = ov::npuw::perf::metric<ov::npuw::perf::MSec>;
133+
ov::npuw::perf::Profile<MS> m_llm_profile;
134+
130135
// Friend declarations for PrefixCachingHelper to access protected members
131136
friend class PrefixCachingHelper;
132137
};

0 commit comments

Comments (0)