Skip to content

Commit a5af1e0

Browse files
authored
NPUW: Enable PREFILL/GENERATE configs in LLMCompiledModel (#28154)
### Details: - *Added parsing of passed `NPUW_LLM_PREFILL_CONFIG` and `NPUW_LLM_GENERATE_CONFIG` options* - *Added parsing of passed `NPUW_LLM_PAD_TOKEN_ID`* ### Tickets: - *EISW-149349* - *EISW-149350* ### Related PRs: - OpenVINO GenAI: openvinotoolkit/openvino.genai#1240
1 parent ef5678a commit a5af1e0

File tree

4 files changed

+73
-38
lines changed

4 files changed

+73
-38
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -412,6 +412,14 @@ static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"
412412
*/
413413
static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"};
414414

415+
/**
416+
* @brief
417+
* Type: ov::AnyMap.
418+
* Tell NPUW the configuration for compilation of prefill model.
419+
* NOTE: !! Write-only !!
420+
*/
421+
static constexpr ov::Property<ov::AnyMap> prefill_config{"NPUW_LLM_PREFILL_CONFIG"};
422+
415423
/**
416424
* @brief
417425
* Type: std::string.
@@ -421,6 +429,13 @@ static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_
421429
*/
422430
static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"};
423431

432+
/**
433+
* @brief
434+
* Type: ov::AnyMap.
435+
* Tell NPUW the configuration for compilation of generate model.
436+
* NOTE: !! Write-only !!
437+
*/
438+
static constexpr ov::Property<ov::AnyMap> generate_config{"NPUW_LLM_GENERATE_CONFIG"};
424439
} // namespace llm
425440

426441
} // namespace npuw

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 33 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,15 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IP
321321
return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()});
322322
}
323323

324+
std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) {
325+
if (auto it = config.find(option_name); it != config.end()) {
326+
std::optional<ov::Any> found = std::make_optional(it->second);
327+
config.erase(it);
328+
return found;
329+
}
330+
return std::nullopt;
331+
}
332+
324333
ov::AnyMap get_baseline_common_config() {
325334
ov::AnyMap config = {
326335
{"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"},
@@ -418,6 +427,13 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
418427
std::map<std::string, ov::Any> npuw_llm_props;
419428
std::map<std::string, ov::Any> other_props;
420429
split_llm_properties(properties, npuw_llm_props, other_props);
430+
431+
// Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map,
432+
// to not pass them into ::intel_npu::Config object, as we don't need to
433+
// preserve them somewhere.
434+
auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG"));
435+
auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG"));
436+
421437
m_cfg.update(any_copy(npuw_llm_props));
422438

423439
LOG_DEBUG("1. Creating kvcache model as clone of passed one.");
@@ -455,17 +471,20 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m
455471
prefill_model = cvt_kvcache_to_fp16(prefill_model);
456472

457473
auto npudesc = extract_npu_descriptor(plugin);
458-
ov::AnyMap properties_copy = other_props;
459-
auto prefill_config = get_default_prefill_config(model, npudesc);
474+
auto prefill_config =
475+
prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();
460476

461-
// NB: GENERATE_HINT is only applicable for default generate config!
462477
const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
463-
LOG_DEBUG(
464-
"10. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
465-
auto generate_config = get_default_generate_config(model, npudesc, generate_hint);
478+
LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint)));
479+
// NB: GENERATE_HINT is only applicable for default generate config!
480+
if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
481+
OPENVINO_THROW("GENERATE_HINT is only applicable for default generate config!");
482+
}
483+
auto generate_config =
484+
generate_config_opt.value_or(get_default_generate_config(model, npudesc, generate_hint)).as<ov::AnyMap>();
466485

467-
merge_config_with(prefill_config, properties_copy);
468-
merge_config_with(generate_config, properties_copy);
486+
merge_config_with(prefill_config, other_props);
487+
merge_config_with(generate_config, other_props);
469488

470489
m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config);
471490
m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config);
@@ -488,6 +507,11 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) {
488507

489508
ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const {
490509
OPENVINO_SUPPRESS_DEPRECATED_START
510+
if (name == ov::intel_npu::npuw::llm::prefill_config.name() ||
511+
name == ov::intel_npu::npuw::llm::generate_config.name()) {
512+
OPENVINO_THROW(name, " is write-only option!");
513+
}
514+
491515
auto&& configIterator = m_prop_to_opt.find(name);
492516
if (configIterator != m_prop_to_opt.cend()) {
493517
return std::get<1>(configIterator->second)(m_cfg);
@@ -504,7 +528,7 @@ std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_i
504528

505529
std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() {
506530
auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this());
507-
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc);
531+
return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr);
508532
}
509533

510534
void ov::npuw::LLMCompiledModel::implement_properties() {

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 23 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -59,10 +59,9 @@ void copy_columns_by_row_chunks(ov::SoPtr<ov::ITensor> src, ov::SoPtr<ov::ITenso
5959
}
6060
} // anonymous namespace
6161

62-
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
63-
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc)
62+
ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
6463
: ov::ISyncInferRequest(compiled_model),
65-
m_kvcache_desc(kvcache_desc) {
64+
m_npuw_llm_compiled_model(compiled_model) {
6665
m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
6766
m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
6867

@@ -82,13 +81,11 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
8281
}
8382

8483
void ov::npuw::LLMInferRequest::prepare_for_new_conversation() {
85-
// FIXME: for input_ids it must be padding from tokenizer that not available from here
86-
// Get it from NPUW options
87-
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u);
88-
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u);
89-
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u);
90-
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u);
91-
m_kvcache_desc.num_stored_tokens = 0u;
84+
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0);
85+
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0);
86+
fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0);
87+
fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0);
88+
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens = 0u;
9289
}
9390

9491
void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
@@ -112,7 +109,7 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
112109
std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset);
113110

114111
m_prefill_request->infer();
115-
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
112+
m_npuw_llm_compiled_model->m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size());
116113
m_need_copy_kvcache = true;
117114

118115
m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits"));
@@ -126,8 +123,9 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
126123
LOG_DEBUG("Calling inference for generate model...");
127124
LOG_BLOCK();
128125

126+
auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
129127
// NB: KV-cache is full, further generation is impossible
130-
if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) {
128+
if (kvcache_desc.num_stored_tokens == kvcache_desc.total_size) {
131129
OPENVINO_THROW("KV-Cache is full.");
132130
}
133131

@@ -146,17 +144,16 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
146144
// taking into account kvcache dimension.
147145
fill_tensor<ov::float16>(kvcache_in_tensor, 0);
148146

149-
const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
147+
const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
150148
? 3u
151-
: m_kvcache_desc.dim;
149+
: kvcache_desc.dim;
152150

153-
auto prefill_out_slice =
154-
make_tensor_slice(prefill_out_tensor,
155-
kv_dim,
156-
m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens,
157-
m_kvcache_desc.max_prompt_size);
151+
auto prefill_out_slice = make_tensor_slice(prefill_out_tensor,
152+
kv_dim,
153+
kvcache_desc.max_prompt_size - kvcache_desc.num_stored_tokens,
154+
kvcache_desc.max_prompt_size);
158155

159-
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, m_kvcache_desc.num_stored_tokens);
156+
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, kv_dim, 0u, kvcache_desc.num_stored_tokens);
160157

161158
if (kv_dim == 3u) {
162159
copy_columns_by_row_chunks(prefill_out_slice, kvcache_in_slice);
@@ -168,7 +165,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
168165
LOG_DEBUG("Prepare attention mask pattern.");
169166
auto* attention_mask_data =
170167
m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>();
171-
attention_mask_data[m_kvcache_desc.total_size - 1] = 1;
168+
attention_mask_data[kvcache_desc.total_size - 1] = 1;
172169

173170
m_need_copy_kvcache = false;
174171
}
@@ -185,7 +182,7 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
185182

186183
m_kvcache_request->infer();
187184
m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits"));
188-
m_kvcache_desc.num_stored_tokens += 1;
185+
kvcache_desc.num_stored_tokens += 1;
189186

190187
LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration.");
191188
const std::size_t kStartOutputKVCacheLayers = 1u;
@@ -194,13 +191,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
194191
const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name();
195192
const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
196193
auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
197-
const auto& kv_dim = (output_name.find("value") != std::string::npos && m_kvcache_desc.v_tensors_transposed)
194+
const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
198195
? 3u
199-
: m_kvcache_desc.dim;
196+
: kvcache_desc.dim;
200197
auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor,
201198
kv_dim,
202-
m_kvcache_desc.num_stored_tokens - 1,
203-
m_kvcache_desc.num_stored_tokens);
199+
kvcache_desc.num_stored_tokens - 1,
200+
kvcache_desc.num_stored_tokens);
204201
auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name));
205202
kvcache_out_tensor->copy_to(kvcache_in_slice._ptr);
206203
}

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,7 @@ namespace npuw {
1515

1616
class LLMInferRequest final : public ov::ISyncInferRequest {
1717
public:
18-
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model,
19-
const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc);
18+
explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model);
2019

2120
void infer() override;
2221

@@ -44,7 +43,7 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
4443

4544
std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request;
4645
std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request;
47-
LLMCompiledModel::KVCacheDesc m_kvcache_desc;
46+
std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
4847
ov::SoPtr<ov::ITensor> m_logits;
4948
bool m_need_copy_kvcache = false;
5049

0 commit comments

Comments (0)