Skip to content

Commit f616896

Browse files
[NPUW] Serialization (#27915)
E-146009 E-149617
1 parent a45f30c commit f616896

File tree

12 files changed

+959
-16
lines changed

12 files changed

+959
-16
lines changed

src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,8 @@ class Config final {
423423

424424
std::string toString() const;
425425

426+
void fromString(const std::string& str);
427+
426428
private:
427429
std::shared_ptr<const OptionsDesc> _desc;
428430
ImplMap _impl;

src/plugins/intel_npu/src/al/src/config/config.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,31 @@ std::string Config::toString() const {
244244
return resultStream.str();
245245
}
246246

247+
void Config::fromString(const std::string& str) {
    // Reverse of toString(): parse a space-separated list of KEY="VALUE"
    // entries back into a key/value map and apply it via update().
    std::map<std::string, std::string> config;
    std::string str_cfg(str);

    auto parse_token = [&](const std::string& token) {
        auto pos_eq = token.find('=');
        // Guard against empty or malformed tokens (e.g. the empty tail left
        // when the input ends with a separator). Without this check,
        // pos_eq == npos makes the substr() below start at position
        // npos + 2 (which wraps to 1) and throw std::out_of_range.
        if (pos_eq == std::string::npos || token.size() < pos_eq + 3) {
            return;
        }
        auto key = token.substr(0, pos_eq);
        // The value is wrapped in quotes: skip the opening '"' and drop the
        // closing one.
        auto value = token.substr(pos_eq + 2, token.size() - pos_eq - 3);
        config[key] = value;
    };

    size_t pos = 0;
    std::string token;
    while ((pos = str_cfg.find(' ')) != std::string::npos) {
        token = str_cfg.substr(0, pos);
        parse_token(token);
        str_cfg.erase(0, pos + 1);
    }

    // Process tail
    parse_token(str_cfg);

    update(config);
}
271+
247272
//
248273
// envVarStrToBool
249274
//

src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp

Lines changed: 245 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (C) 2023-2024 Intel Corporation
1+
// Copyright (C) 2023-2025 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33
//
44
#include "compiled_model.hpp"
@@ -21,6 +21,7 @@
2121
#include "openvino/util/common_util.hpp"
2222
#include "partitioning/patterns/opt.hpp"
2323
#include "plugin.hpp"
24+
#include "serialization.hpp"
2425
#include "unfold_sync_infer_request.hpp"
2526
#include "util.hpp"
2627

@@ -486,6 +487,222 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
486487
report_io();
487488
}
488489

490+
// Lightweight constructor used exclusively by deserialize(): builds the base
// ICompiledModel around the (reconstructed) ov::Model and registers the NPUW
// options, but intentionally skips partitioning/compilation — the caller
// restores all internal state from the stream afterwards.
// The `serialized` flag must be true; it also marks the model as loaded from
// cache for property reporting.
ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model,
                                       const std::shared_ptr<const ov::IPlugin>& plugin,
                                       const bool serialized)
    : ov::npuw::ICompiledModel(model, plugin),
      m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()),
      m_cfg(m_options_desc),
      m_name(model->get_friendly_name()),
      // A deserialized model is reported as loaded from cache
      m_loaded_from_cache(serialized) {
    ::intel_npu::registerNPUWOptions(*m_options_desc);
    NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!");
    LOG_DEBUG("CompiledModel is being deserialized, skipping the full constructor flow...");
}
502+
503+
void ov::npuw::CompiledModel::CompiledModelDesc::serialize(std::ostream& stream) const {
504+
using namespace ov::npuw::s11n;
505+
506+
LOG_DEBUG("Serializing CompiledModelDesc...");
507+
LOG_BLOCK();
508+
509+
write(stream, replaced_by);
510+
511+
write(stream, param_base);
512+
write(stream, forced_to_fcall);
513+
514+
write(stream, host_gather.dst_idx);
515+
write(stream, host_gather.src_idx);
516+
write(stream, host_gather.idx_idx);
517+
518+
write(stream, spatial);
519+
520+
write(stream, scales);
521+
write(stream, zerops);
522+
write(stream, is_remote);
523+
524+
// NOTE: for closure only serialize uids - full flow
525+
write(stream, closure_uid);
526+
527+
// Some tensors might be present in CPU closure already - need to serialize as is
528+
// FIXME: When weightless serialization is introduced, this should be handled differently
529+
write(stream, closure.size());
530+
std::vector<ov::Tensor> cpu_closures;
531+
std::vector<std::size_t> cpu_closure_ids;
532+
for (std::size_t cidx = 0; cidx < closure.size(); ++cidx) {
533+
if (closure_uid[cidx] == -1) { // CPU closure, not in the bank
534+
cpu_closure_ids.push_back(cidx);
535+
cpu_closures.push_back(closure[cidx]);
536+
}
537+
}
538+
539+
write(stream, cpu_closure_ids);
540+
541+
for (const auto& tensor : cpu_closures) {
542+
write(stream, tensor);
543+
}
544+
545+
// FIXME: support weightless flow!
546+
547+
LOG_DEBUG("DONE.");
548+
}
549+
550+
void ov::npuw::CompiledModel::CompiledModelDesc::deserialize(std::istream& stream) {
551+
using namespace ov::npuw::s11n;
552+
553+
LOG_DEBUG("Deserializing CompiledModelDesc...");
554+
LOG_BLOCK();
555+
556+
read(stream, replaced_by);
557+
558+
read(stream, param_base);
559+
read(stream, forced_to_fcall);
560+
561+
read(stream, host_gather.dst_idx);
562+
read(stream, host_gather.src_idx);
563+
read(stream, host_gather.idx_idx);
564+
565+
read(stream, spatial);
566+
567+
read(stream, scales);
568+
read(stream, zerops);
569+
read(stream, is_remote);
570+
571+
// NOTE: for closure only deserialize uids - full flow
572+
read(stream, closure_uid);
573+
574+
// Some tensors might be present in CPU closure already - need to deserialize as is
575+
// FIXME: When weightless serialization is introduced, this should be handled differently
576+
std::size_t closure_size = 0;
577+
read(stream, closure_size);
578+
std::vector<std::size_t> cpu_closure_ids;
579+
read(stream, cpu_closure_ids);
580+
closure.resize(closure_size);
581+
for (const auto& cidx : cpu_closure_ids) {
582+
read(stream, closure[cidx]);
583+
}
584+
585+
// FIXME: support weightless flow!
586+
587+
LOG_DEBUG("DONE.");
588+
}
589+
590+
void ov::npuw::CompiledModel::serialize(std::ostream& stream) const {
591+
LOG_INFO("Serializing CompiledModel...");
592+
LOG_BLOCK();
593+
594+
using namespace ov::npuw::s11n;
595+
596+
// Serialize name
597+
write(stream, m_name);
598+
599+
// Serialize inputs and outputs
600+
write(stream, inputs());
601+
write(stream, outputs());
602+
603+
// Serialize meta
604+
write(stream, m_inputs_to_submodels_inputs);
605+
write(stream, m_outputs_to_submodels_outputs);
606+
write(stream, m_param_subscribers);
607+
write(stream, m_submodels_input_to_prev_output);
608+
609+
// Write device list
610+
write(stream, m_dev_list);
611+
612+
// Write config
613+
write(stream, m_cfg);
614+
615+
// Serialize compiled submodels
616+
write(stream, m_compiled_submodels.size());
617+
for (const auto& subm : m_compiled_submodels) {
618+
// Write device idx
619+
std::size_t device_idx = subm.device_it - m_dev_list.begin();
620+
write(stream, device_idx);
621+
// Write ICompiledModel if it's there
622+
if (subm.compiled_model) {
623+
write(stream, true);
624+
// FIXME: workaround for import/export model since import model seem to reset the file pointer
625+
std::stringstream ss;
626+
subm.compiled_model->export_model(ss);
627+
write(stream, ss.str());
628+
} else {
629+
write(stream, false);
630+
}
631+
// Write the rest of the submodel desc
632+
subm.serialize(stream);
633+
}
634+
635+
LOG_INFO("Done.");
636+
}
637+
638+
std::shared_ptr<ov::npuw::CompiledModel> ov::npuw::CompiledModel::deserialize(
639+
std::istream& stream,
640+
const std::shared_ptr<const ov::IPlugin>& plugin) {
641+
LOG_INFO("Deserializing CompiledModel...");
642+
LOG_BLOCK();
643+
644+
using namespace ov::npuw::s11n;
645+
646+
// Deserialize model name first
647+
std::string model_name;
648+
read(stream, model_name);
649+
650+
// Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow
651+
// to continue deserialization
652+
ov::ParameterVector parameters;
653+
ov::NodeVector results;
654+
655+
read(stream, parameters);
656+
read(stream, results);
657+
658+
auto ov_model = std::make_shared<ov::Model>(results, parameters, model_name);
659+
660+
auto compiled = std::make_shared<ov::npuw::CompiledModel>(ov_model, plugin, true);
661+
662+
// Deserialize meta
663+
compiled->m_name = model_name;
664+
read(stream, compiled->m_inputs_to_submodels_inputs);
665+
read(stream, compiled->m_outputs_to_submodels_outputs);
666+
read(stream, compiled->m_param_subscribers);
667+
read(stream, compiled->m_submodels_input_to_prev_output);
668+
669+
// Deserialize device list
670+
read(stream, compiled->m_dev_list);
671+
672+
// Deserialize config
673+
read(stream, compiled->m_cfg);
674+
675+
// Deserialize compiled submodels
676+
std::size_t subm_size = 0;
677+
read(stream, subm_size);
678+
compiled->m_compiled_submodels.resize(subm_size);
679+
for (std::size_t i = 0; i < subm_size; ++i) {
680+
std::size_t device_idx = 0;
681+
read(stream, device_idx);
682+
683+
bool has_compiled_model = false;
684+
read(stream, has_compiled_model);
685+
if (has_compiled_model) {
686+
// Import model from the plugin
687+
// FIXME: workaround for import/export model since import model seems to reset the file pointer
688+
std::string buf;
689+
read(stream, buf);
690+
std::stringstream buffer(buf);
691+
compiled->m_compiled_submodels[i].compiled_model =
692+
plugin->get_core()->import_model(buffer, compiled->m_dev_list[device_idx]);
693+
}
694+
compiled->m_compiled_submodels[i].device_it = compiled->m_dev_list.begin() + device_idx;
695+
compiled->m_compiled_submodels[i].deserialize(stream);
696+
}
697+
698+
compiled->implement_properties();
699+
compiled->report_io();
700+
701+
LOG_INFO("Done.");
702+
703+
return compiled;
704+
}
705+
489706
void ov::npuw::CompiledModel::finalize_weights_bank() {
490707
LOG_INFO("Finalizing weights bank...");
491708
// Register lazy tensors
@@ -541,6 +758,33 @@ void ov::npuw::CompiledModel::finalize_weights_bank() {
541758
LOG_INFO("Done.");
542759
}
543760

761+
void ov::npuw::CompiledModel::reconstruct_closure() {
762+
for (size_t idx = 0; idx < m_compiled_submodels.size(); ++idx) {
763+
auto& comp_model_desc = m_compiled_submodels[idx];
764+
765+
// Skip optimized out and non-functions
766+
if (!comp_model_desc.compiled_model && !comp_model_desc.replaced_by) {
767+
continue;
768+
}
769+
770+
const auto real_idx = comp_model_desc.replaced_by.value_or(idx);
771+
auto& func_desc = m_compiled_submodels[real_idx];
772+
773+
// At this point closure size should have already been deserialized
774+
NPUW_ASSERT(!comp_model_desc.closure.empty() && "Closure shouldn't be empty at this point!");
775+
for (std::size_t cidx = 0; cidx < comp_model_desc.closure.size(); ++cidx) {
776+
if (comp_model_desc.closure[cidx]) {
777+
// host-side closure - already set, do nothing
778+
NPUW_ASSERT(!comp_model_desc.is_remote[cidx]);
779+
continue;
780+
}
781+
NPUW_ASSERT(comp_model_desc.closure_uid[cidx] != -1);
782+
comp_model_desc.closure[cidx] =
783+
m_weights_bank->get(comp_model_desc.closure_uid[cidx], *func_desc.device_it);
784+
}
785+
}
786+
}
787+
544788
void ov::npuw::CompiledModel::detach_memory() {
545789
LOG_INFO("Detaching model & weight memory...");
546790
LOG_BLOCK();

src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// Copyright (C) 2023-2024 Intel Corporation
1+
// Copyright (C) 2023-2025 Intel Corporation
22
// SPDX-License-Identifier: Apache-2.0
33
//
44

@@ -40,6 +40,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
4040
CompiledModel(const std::shared_ptr<ov::Model>& model,
4141
const std::shared_ptr<const ov::IPlugin>& plugin,
4242
const ov::AnyMap& properties);
43+
CompiledModel(const std::shared_ptr<ov::Model>& model,
44+
const std::shared_ptr<const ov::IPlugin>& plugin,
45+
const bool serialized);
4346

4447
void export_model(std::ostream& model) const override;
4548
std::shared_ptr<const ov::Model> get_runtime_model() const override;
@@ -56,6 +59,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
5659
friend class UnfoldInferRequest;
5760
friend class MemAccessSim;
5861
friend class FuncMemMgr;
62+
friend class LLMCompiledModel;
5963

6064
bool compile_for_success(std::size_t id);
6165
bool compile_for_device(std::size_t id, const std::string& device_to_try);
@@ -66,6 +70,10 @@ class CompiledModel : public ov::npuw::ICompiledModel {
6670

6771
void report_io() const;
6872

73+
void serialize(std::ostream& stream) const;
74+
static std::shared_ptr<CompiledModel> deserialize(std::istream& stream,
75+
const std::shared_ptr<const ov::IPlugin>& plugin);
76+
6977
// This is used for removing too long output tensor names to fix some compilation issues
7078
// NB: These two methods has nothing to do with this particular class and should be
7179
// moved elsewhere
@@ -83,6 +91,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
8391
void log_device_dist() const;
8492
void implement_properties();
8593

94+
// For full deserialization flow with weights
95+
void reconstruct_closure();
96+
8697
void finalize_weights_bank();
8798
void detach_memory();
8899
std::string global_mem_device() const;
@@ -141,7 +152,7 @@ class CompiledModel : public ov::npuw::ICompiledModel {
141152
// lazy_closure is used for weights sharing and allocating device memory.
142153
std::vector<ov::Tensor> closure;
143154
std::vector<weights::LazyTensor> lazy_closure;
144-
std::vector<int64_t> closure_uid;
155+
std::vector<int64_t> closure_uid; // Note: value -1 is considered uninitialized
145156
std::vector<ov::Tensor> scales;
146157
std::vector<ov::Tensor> zerops;
147158
std::vector<bool> is_remote;
@@ -154,6 +165,9 @@ class CompiledModel : public ov::npuw::ICompiledModel {
154165

155166
// Metrics
156167
execution_stats stat;
168+
169+
void serialize(std::ostream& stream) const;
170+
void deserialize(std::istream& stream);
157171
};
158172
std::vector<CompiledModelDesc> m_compiled_submodels;
159173

0 commit comments

Comments
 (0)