-
Notifications
You must be signed in to change notification settings - Fork 3.1k
[NPUW] gemma-2 patterns added to preserve tail constants matcher #32465
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
6ad8b26
1209046
8c2da6f
29cb65e
1ea470e
3c19c32
be1e6d4
88c3e74
3125293
84cb7bc
5274f04
268048a
0a811c9
b872130
2ca0339
a9af6f9
fb12f9a
240e41b
a0186bd
d673f36
d52dc2e
8e04ee1
87029d7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1162,6 +1162,7 @@ struct NPUDesc { | |
| std::string arch; | ||
| int64_t max_tiles = 0; | ||
| bool compiler_dq = false; | ||
| bool compiler_matmul_gate = false; | ||
| int64_t compiler_ver = 0; | ||
| bool support_flash_attention_tile = false; | ||
| }; | ||
|
|
@@ -1199,6 +1200,19 @@ std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IP | |
| ov::AnyMap{{ov::intel_npu::compiler_type.name(), target_compiler_type}}) | ||
| .as<int64_t>(); | ||
| } | ||
| LOG_INFO("Compiler version: " << ONEAPI_VERSION_MAJOR(desc.compiler_ver) << "." | ||
| << ONEAPI_VERSION_MINOR(desc.compiler_ver)); | ||
|
|
||
| constexpr std::string_view compiler_gate_support_msg = | ||
| "Compiler: accurate gated matmul (MatMul -> Divide -> Tanh -> Multiply -> Result) : "; | ||
|
|
||
| if (desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 28)) { | ||
| // accuracy for gated matmul fixed at 7.28 | ||
| desc.compiler_matmul_gate = true; | ||
| LOG_INFO(compiler_gate_support_msg << "supported"); | ||
| } else { | ||
| LOG_WARN(compiler_gate_support_msg << "unsupported"); | ||
| } | ||
|
|
||
| if (desc.arch == "5010" && desc.compiler_ver >= ONEAPI_MAKE_VERSION(7, 29)) { | ||
| // Flash attention tile is supported starting from compiler version 7.29 on NPU5010 | ||
|
|
@@ -1247,6 +1261,11 @@ ov::AnyMap get_baseline_common_config(const std::optional<NPUDesc>& npudesc) { | |
| config.erase("NPUW_DCOFF_TYPE"); | ||
| config.erase("NPUW_DCOFF_SCALE"); | ||
| } | ||
|
|
||
| // NPUW_MM_GATED is ON by default; when an NPU descriptor is available, set it explicitly so it can be turned off for older compilers | ||
| if (npudesc.has_value()) { | ||
| config.emplace("NPUW_MM_GATED", (npudesc->compiler_matmul_gate ? "YES" : "NO")); | ||
| } | ||
| return config; | ||
| } | ||
|
|
||
|
|
@@ -1878,7 +1897,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& m | |
| if (!m_is_embedding) { | ||
| if (!m_use_chunk_prefill) { | ||
| // TODO: is it acceptable in some cases that no empty inputs are found? | ||
| NPUW_ASSERT(remove_empty_kv_inputs(prefill_model)); | ||
| remove_empty_kv_inputs(prefill_model); | ||
|
||
| } else { | ||
| LOG_DEBUG("Don't remove input key/values from prefill model."); | ||
| LOG_DEBUG("Ask prefill model to output key/values for prefill chunk size tokens."); | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.