Skip to content

Commit 4644007

Browse files
#1928 Too many data and uid copies when loading files (#1931)
Signed-off-by: shengjun.li <shengjun.li@zilliz.com>
Co-authored-by: Jin Hai <hai.jin@zilliz.com>
1 parent 7ed6edc commit 4644007

File tree

5 files changed, with 26 additions and 22 deletions.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Please mark all change in change log and use the issue from GitHub
3333
- \#1885 Optimize knowhere unittest
3434
- \#1886 Refactor log on search and insert request
3535
- \#1897 Heap pop and push can be realized by heap_swap_top
36+
- \#1928 Fix too many data and uid copies when loading files
3637
- \#1930 Upgrade mishards to 0.8.0
3738

3839
## Task

core/src/codecs/default/DefaultVectorsFormat.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,12 @@ DefaultVectorsFormat::read(const storage::FSHandlerPtr& fs_ptr, segment::Vectors
9090
for (; it != it_end; ++it) {
9191
const auto& path = it->path();
9292
if (path.extension().string() == raw_vector_extension_) {
93-
std::vector<uint8_t> vector_list;
93+
auto& vector_list = vectors_read->GetMutableData();
9494
read_vectors_internal(fs_ptr, path.string(), 0, INT64_MAX, vector_list);
95-
vectors_read->AddData(vector_list);
9695
vectors_read->SetName(path.stem().string());
97-
}
98-
if (path.extension().string() == user_id_extension_) {
99-
std::vector<segment::doc_id_t> uids;
96+
} else if (path.extension().string() == user_id_extension_) {
97+
auto& uids = vectors_read->GetMutableUids();
10098
read_uids_internal(fs_ptr, path.string(), uids);
101-
vectors_read->AddUids(uids);
10299
}
103100
}
104101
}

core/src/db/engine/ExecutionEngineImpl.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,6 @@ ExecutionEngineImpl::Serialize() {
375375

376376
Status
377377
ExecutionEngineImpl::Load(bool to_cache) {
378-
// TODO(zhiru): refactor
379-
380378
index_ = std::static_pointer_cast<knowhere::VecIndex>(cache::CpuCacheMgr::GetInstance()->GetIndex(location_));
381379
bool already_in_cache = (index_ != nullptr);
382380
if (!already_in_cache) {
@@ -411,21 +409,19 @@ ExecutionEngineImpl::Load(bool to_cache) {
411409
auto& vectors = segment_ptr->vectors_ptr_;
412410
auto& deleted_docs = segment_ptr->deleted_docs_ptr_->GetDeletedDocs();
413411

414-
auto vectors_uids = vectors->GetUids();
412+
auto& vectors_uids = vectors->GetMutableUids();
413+
auto count = vectors_uids.size();
415414
index_->SetUids(vectors_uids);
416415
ENGINE_LOG_DEBUG << "set uids " << index_->GetUids().size() << " for index " << location_;
417416

418-
auto vectors_data = vectors->GetData();
417+
auto& vectors_data = vectors->GetData();
419418

420-
faiss::ConcurrentBitsetPtr concurrent_bitset_ptr =
421-
std::make_shared<faiss::ConcurrentBitset>(vectors->GetCount());
419+
faiss::ConcurrentBitsetPtr concurrent_bitset_ptr = std::make_shared<faiss::ConcurrentBitset>(count);
422420
for (auto& offset : deleted_docs) {
423-
if (!concurrent_bitset_ptr->test(offset)) {
424-
concurrent_bitset_ptr->set(offset);
425-
}
421+
concurrent_bitset_ptr->set(offset);
426422
}
427423

428-
auto dataset = knowhere::GenDataset(vectors->GetCount(), this->dim_, vectors_data.data());
424+
auto dataset = knowhere::GenDataset(count, this->dim_, vectors_data.data());
429425
if (index_type_ == EngineType::FAISS_IDMAP) {
430426
auto bf_index = std::static_pointer_cast<knowhere::IDMAP>(index_);
431427
bf_index->Train(knowhere::DatasetPtr(), conf);

core/src/segment/Vectors.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,6 @@
2828
namespace milvus {
2929
namespace segment {
3030

31-
Vectors::Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name)
32-
: data_(std::move(data)), uids_(std::move(uids)), name_(name) {
33-
}
34-
3531
void
3632
Vectors::AddData(const std::vector<uint8_t>& data) {
3733
data_.reserve(data_.size() + data.size());
@@ -120,6 +116,16 @@ Vectors::Erase(std::vector<int32_t>& offsets) {
120116
<< diff.count() << " s";
121117
}
122118

119+
std::vector<uint8_t>&
120+
Vectors::GetMutableData() {
121+
return data_;
122+
}
123+
124+
std::vector<doc_id_t>&
125+
Vectors::GetMutableUids() {
126+
return uids_;
127+
}
128+
123129
const std::vector<uint8_t>&
124130
Vectors::GetData() const {
125131
return data_;

core/src/segment/Vectors.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,6 @@ using doc_id_t = int64_t;
2828

2929
class Vectors {
3030
public:
31-
Vectors(std::vector<uint8_t> data, std::vector<doc_id_t> uids, const std::string& name);
32-
3331
Vectors() = default;
3432

3533
void
@@ -41,6 +39,12 @@ class Vectors {
4139
void
4240
SetName(const std::string& name);
4341

42+
std::vector<uint8_t>&
43+
GetMutableData();
44+
45+
std::vector<doc_id_t>&
46+
GetMutableUids();
47+
4448
const std::vector<uint8_t>&
4549
GetData() const;
4650

0 commit comments

Comments
 (0)