From 95e4cf1809abed903808244928b224ab0fb22812 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Sun, 30 Jul 2023 23:43:06 +0000 Subject: [PATCH 1/2] Add compile time feature-gate for lookups and removals --- c/lib.cpp | 12 ++++-- cpp/CMakeLists.txt | 4 ++ cpp/test.cpp | 4 ++ include/usearch/index_punned_dense.hpp | 59 +++++++++++++++++++++++--- 4 files changed, 70 insertions(+), 9 deletions(-) diff --git a/c/lib.cpp b/c/lib.cpp index a3a283c3..adf7b533 100644 --- a/c/lib.cpp +++ b/c/lib.cpp @@ -59,6 +59,7 @@ add_result_t add_(index_t* index, usearch_label_t label, void const* vector, sca } } +#if USEARCH_LOOKUP_LABEL bool get_(index_t* index, label_t label, void* vector, scalar_kind_t kind) { switch (kind) { case scalar_kind_t::f32_k: return index->get(label, (f32_t*)vector); @@ -69,6 +70,7 @@ bool get_(index_t* index, label_t label, void* vector, scalar_kind_t kind) { default: return index->empty_search_result().failed("Unknown scalar kind!"); } } +#endif search_result_t search_(index_t* index, void const* vector, scalar_kind_t kind, size_t n) { switch (kind) { @@ -153,7 +155,7 @@ USEARCH_EXPORT void usearch_reserve(usearch_index_t index, size_t capacity, usea reinterpret_cast(index)->reserve(capacity); } -USEARCH_EXPORT void usearch_add( // +USEARCH_EXPORT void usearch_add( // usearch_index_t index, usearch_label_t label, void const* vector, usearch_scalar_kind_t kind, // usearch_error_t* error) { add_result_t result = add_(reinterpret_cast(index), label, vector, to_native_scalar(kind)); @@ -161,11 +163,13 @@ USEARCH_EXPORT void usearch_add( *error = result.error.what(); } +#if USEARCH_LOOKUP_LABEL USEARCH_EXPORT bool usearch_contains(usearch_index_t index, usearch_label_t label, usearch_error_t*) { return reinterpret_cast(index)->contains(label); } +#endif -USEARCH_EXPORT size_t usearch_search( // +USEARCH_EXPORT size_t usearch_search( // usearch_index_t index, void const* vector, usearch_scalar_kind_t kind, size_t results_limit, // usearch_label_t* 
found_labels, usearch_distance_t* found_distances, usearch_error_t* error) { search_result_t result = search_(reinterpret_cast(index), vector, to_native_scalar(kind), results_limit); @@ -177,11 +181,13 @@ USEARCH_EXPORT size_t usearch_search( return result.dump_to(found_labels, found_distances); } -USEARCH_EXPORT bool usearch_get( // +#if USEARCH_LOOKUP_LABEL +USEARCH_EXPORT bool usearch_get( // usearch_index_t index, usearch_label_t label, // void* vector, usearch_scalar_kind_t kind, usearch_error_t*) { return get_(reinterpret_cast(index), label, vector, to_native_scalar(kind)); } +#endif USEARCH_EXPORT void usearch_remove(usearch_index_t, usearch_label_t, usearch_error_t* error) { if (error != nullptr) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 44bb2ca0..83cfb4a9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -3,6 +3,7 @@ cmake_minimum_required(VERSION 3.1) option(USEARCH_USE_OPENMP "Use OpenMP for a thread pool" OFF) option(USEARCH_USE_SIMSIMD "Use SimSIMD hardware-accelerated metrics" OFF) option(USEARCH_USE_JEMALLOC "Use JeMalloc for faster memory allocations" OFF) +option(USEARCH_LOOKUP_LABEL "Compile with label lookup and removal tests" OFF) # Make "Release" by default if(NOT CMAKE_BUILD_TYPE) @@ -83,6 +84,9 @@ if(${USEARCH_BUILD_TEST}) target_link_libraries(test PRIVATE Threads::Threads) target_include_directories(test PRIVATE ${USEARCH_PUNNED_INCLUDE_DIRS}) set_target_properties(test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) + if (${USEARCH_LOOKUP_LABEL}) + target_compile_definitions(test PRIVATE USEARCH_LOOKUP_LABEL=1) + endif() if(${CMAKE_VERSION} VERSION_EQUAL 3.13 OR ${CMAKE_VERSION} VERSION_GREATER 3.13) include(CTest) diff --git a/cpp/test.cpp b/cpp/test.cpp index 1045e1f8..f45bf336 100644 --- a/cpp/test.cpp +++ b/cpp/test.cpp @@ -92,16 +92,20 @@ template void test3d_punned(index_at&& i index.add(42, view_t{&vec42[0], 3ul}); // Reconstruct +#if USEARCH_LOOKUP_LABEL scalar_t vec42_reconstructed[3] = {0, 0, 
0}; index.get(42, span_t{&vec42_reconstructed[0], 3ul}); expect(vec42_reconstructed[0] == vec42[0]); expect(vec42_reconstructed[1] == vec42[1]); expect(vec42_reconstructed[2] == vec42[2]); +#endif index.add(43, view_t{&vec43[0], 3ul}); expect(index.size() == 2); +#if USEARCH_LOOKUP_LABEL index.remove(43); expect(index.size() == 1); +#endif } template void test_sets(index_at&& index) { diff --git a/include/usearch/index_punned_dense.hpp b/include/usearch/index_punned_dense.hpp index b2a19613..782cd36c 100644 --- a/include/usearch/index_punned_dense.hpp +++ b/include/usearch/index_punned_dense.hpp @@ -185,12 +185,14 @@ class index_punned_dense_gt { using shared_lock_t = std::unique_lock; using unique_lock_t = std::unique_lock; +#if USEARCH_LOOKUP_LABEL mutable shared_mutex_t labeled_lookup_mutex_; tsl::robin_map labeled_lookup_; mutable std::mutex free_ids_mutex_; ring_gt free_ids_; label_t free_label_; +#endif public: using search_result_t = typename index_t::search_result_t; @@ -211,10 +213,15 @@ class index_punned_dense_gt { cast_buffer_(std::move(other.cast_buffer_)), // casts_(std::move(other.casts_)), // root_metric_(std::move(other.root_metric_)), // - available_threads_(std::move(other.available_threads_)), // - labeled_lookup_(std::move(other.labeled_lookup_)), // - free_ids_(std::move(other.free_ids_)), // - free_label_(std::move(other.free_label_)) {} // + available_threads_(std::move(other.available_threads_)) +#if USEARCH_LOOKUP_LABEL + , + labeled_lookup_(std::move(other.labeled_lookup_)), // + free_ids_(std::move(other.free_ids_)), // + free_label_(std::move(other.free_label_)) // +#endif + { + } // index_punned_dense_gt& operator=(index_punned_dense_gt&& other) { swap(other); @@ -236,9 +243,11 @@ class index_punned_dense_gt { std::swap(casts_, other.casts_); std::swap(root_metric_, other.root_metric_); std::swap(available_threads_, other.available_threads_); +#if USEARCH_LOOKUP_LABEL std::swap(labeled_lookup_, other.labeled_lookup_); 
std::swap(free_ids_, other.free_ids_); std::swap(free_label_, other.free_label_); +#endif } ~index_punned_dense_gt() { @@ -303,7 +312,13 @@ class index_punned_dense_gt { std::size_t dimensions() const { return dimensions_; } std::size_t scalar_words() const { return scalar_words_; } std::size_t connectivity() const { return typed_->connectivity(); } - std::size_t size() const { return typed_->size() - free_ids_.size(); } + std::size_t size() const { +#if USEARCH_LOOKUP_LABEL + return typed_->size() - free_ids_.size(); +#else + return typed_->size(); +#endif + } std::size_t capacity() const { return typed_->capacity(); } std::size_t max_level() const noexcept { return typed_->max_level(); } index_config_t const& config() const { return typed_->config(); } @@ -356,11 +371,13 @@ class index_punned_dense_gt { search_result_t search(f32_t const* vector, std::size_t wanted, search_config_t config) const { return search_(vector, wanted, config, casts_.from_f32); } search_result_t search(f64_t const* vector, std::size_t wanted, search_config_t config) const { return search_(vector, wanted, config, casts_.from_f64); } +#if USEARCH_LOOKUP_LABEL bool get(label_t label, b1x8_t* vector) const { return get_(label, vector, casts_.to_b1x8); } bool get(label_t label, f8_bits_t* vector) const { return get_(label, vector, casts_.to_f8); } bool get(label_t label, f16_t* vector) const { return get_(label, vector, casts_.to_f16); } bool get(label_t label, f32_t* vector) const { return get_(label, vector, casts_.to_f32); } bool get(label_t label, f64_t* vector) const { return get_(label, vector, casts_.to_f64); } +#endif // clang-format on search_result_t empty_search_result() const { return search_result_t{*typed_}; } @@ -370,10 +387,12 @@ class index_punned_dense_gt { * @return `true` if the memory reservation was successful, `false` otherwise. 
*/ bool reserve(index_limits_t limits) { +#if USEARCH_LOOKUP_LABEL { unique_lock_t lock(labeled_lookup_mutex_); labeled_lookup_.reserve(limits.members); } +#endif return typed_->reserve(limits); } @@ -381,11 +400,13 @@ class index_punned_dense_gt { * @brief Clears the whole index, reclaiming the memory. */ void clear() { +#if USEARCH_LOOKUP_LABEL unique_lock_t lookup_lock(labeled_lookup_mutex_); std::unique_lock free_lock(free_ids_mutex_); - typed_->clear(); labeled_lookup_.clear(); free_ids_.clear(); +#endif + typed_->clear(); } /** @@ -402,8 +423,10 @@ class index_punned_dense_gt { */ serialization_result_t load(char const* path) { serialization_result_t result = typed_->load(path); +#if USEARCH_LOOKUP_LABEL if (result) reindex_labels_(); +#endif return result; } @@ -414,11 +437,14 @@ class index_punned_dense_gt { */ serialization_result_t view(char const* path) { serialization_result_t result = typed_->view(path); +#if USEARCH_LOOKUP_LABEL if (result) reindex_labels_(); +#endif return result; } +#if USEARCH_LOOKUP_LABEL /** * @brief Checks if a vector with specidied label is present. * @return `true` if the label is present in the index, `false` otherwise. @@ -427,6 +453,7 @@ class index_punned_dense_gt { shared_lock_t lock(labeled_lookup_mutex_); return labeled_lookup_.contains(label); } +#endif struct labeling_result_t { error_t error{}; @@ -439,6 +466,7 @@ class index_punned_dense_gt { } }; +#if USEARCH_LOOKUP_LABEL /** * @brief Removes an entry with the specified label from the index. * @param[in] label The label of the entry to remove. 
@@ -552,6 +580,7 @@ class index_punned_dense_gt { for (; it != labeled_lookup_.end() && limit; ++it, ++labels, --limit) *labels = it->first; } +#endif /** * @brief Adapts the Male-Optimal Stable Marriage algorithm for unequal sets @@ -609,12 +638,14 @@ class index_punned_dense_gt { auto typed_result = typed_->copy(config); if (!typed_result) return result.failed(std::move(typed_result.error)); +#if USEARCH_LOOKUP_LABEL if (!result.index.free_ids_.reserve(free_ids_.size())) return result.failed(std::move(typed_result.error)); for (std::size_t i = 0; i != free_ids_.size(); ++i) result.index.free_ids_.push(free_ids_[i]); result.index.labeled_lookup_ = labeled_lookup_; +#endif *result.index.typed_ = std::move(typed_result.index); return result; } @@ -658,6 +689,7 @@ class index_punned_dense_gt { } }; +#if USEARCH_LOOKUP_LABEL /** * @brief Performs compaction on the index, pruning links to removed entries. * @param executor The executor parallel processing. Default ::dummy_executor_t single-threaded. @@ -679,6 +711,7 @@ class index_punned_dense_gt { result.pruned_edges = pruned_edges; return result; } +#endif private: struct thread_lock_t { @@ -714,20 +747,24 @@ class index_punned_dense_gt { // Check if there are some removed entries, whose nodes we can reuse id_t free_id = default_free_value(); +#if USEARCH_LOOKUP_LABEL { std::unique_lock lock(free_ids_mutex_); free_ids_.try_pop(free_id); } +#endif // Perform the insertion or the update add_result_t result = // free_id != default_free_value() // ? 
typed_->update(free_id, label, {vector_data, vector_bytes}, config) : typed_->add(label, {vector_data, vector_bytes}, config); +#if USEARCH_LOOKUP_LABEL { unique_lock_t lock(labeled_lookup_mutex_); labeled_lookup_.emplace(label, result.id); } +#endif return result; } @@ -744,15 +781,22 @@ class index_punned_dense_gt { if (casted) vector_data = casted_data, vector_bytes = casted_vector_bytes_; +#if USEARCH_LOOKUP_LABEL auto allow = [=](match_t const& match) noexcept { return match.member.label != free_label_; }; return typed_->search({vector_data, vector_bytes}, wanted, config, allow); +#else + return typed_->search({vector_data, vector_bytes}, wanted, config); +#endif } +#if USEARCH_LOOKUP_LABEL id_t lookup_id_(label_t label) const { shared_lock_t lock(labeled_lookup_mutex_); return labeled_lookup_.at(label); } +#endif +#if USEARCH_LOOKUP_LABEL void reindex_labels_() { // Estimate number of entries first @@ -795,6 +839,7 @@ class index_punned_dense_gt { std::memcpy(reconstructed, punned_vector, casted_vector_bytes_); return true; } +#endif template add_result_t add_(label_t label, scalar_at const* vector, cast_t const& cast) { thread_lock_t lock = thread_lock_(); @@ -828,7 +873,9 @@ class index_punned_dense_gt { result.cast_buffer_.resize(hardware_threads * result.casted_vector_bytes_); result.casts_ = casts; result.root_metric_ = metric; +#if USEARCH_LOOKUP_LABEL result.free_label_ = free_label; +#endif // Fill the thread IDs. 
result.available_threads_.resize(hardware_threads); From e763615aff25c3993d489da2952ce4e13fcefd54 Mon Sep 17 00:00:00 2001 From: Narek Galstyan Date: Mon, 31 Jul 2023 07:42:46 +0000 Subject: [PATCH 2/2] Port external index machinery to the latest version of usearch --- c/lib.cpp | 61 +++- c/usearch.h | 25 +- include/usearch/index.hpp | 401 +++++++++++++++++++------ include/usearch/index_punned_dense.hpp | 48 ++- 4 files changed, 440 insertions(+), 95 deletions(-) diff --git a/c/lib.cpp b/c/lib.cpp index adf7b533..fd091658 100644 --- a/c/lib.cpp +++ b/c/lib.cpp @@ -1,4 +1,5 @@ #include +#include #include @@ -48,9 +49,10 @@ scalar_kind_t to_native_scalar(usearch_scalar_kind_t kind) { } } -add_result_t add_(index_t* index, usearch_label_t label, void const* vector, scalar_kind_t kind) { +add_result_t add_(index_t* index, usearch_label_t label, void const* vector, scalar_kind_t kind, int32_t level, + void* tape) { switch (kind) { - case scalar_kind_t::f32_k: return index->add(label, (f32_t const*)vector); + case scalar_kind_t::f32_k: return index->add(label, (f32_t const*)vector, level, (byte_t*)tape); case scalar_kind_t::f64_k: return index->add(label, (f64_t const*)vector); case scalar_kind_t::f16_k: return index->add(label, (f16_t const*)vector); case scalar_kind_t::f8_k: return index->add(label, (f8_bits_t const*)vector); @@ -100,6 +102,7 @@ USEARCH_EXPORT usearch_index_t usearch_init(usearch_init_options_t* options, use index_config_t config; config.connectivity = options->connectivity; + config.vector_alignment = sizeof(float); index_t index = // options->metric ? 
// index_t::make( // @@ -134,6 +137,41 @@ USEARCH_EXPORT void usearch_view(usearch_index_t index, char const* path, usearc *error = result.error.what(); } +void usearch_view_mem(usearch_index_t index, char* data, usearch_error_t* error) { + serialization_result_t result = reinterpret_cast(index)->view_mem(data); + if (!result) + *error = result.error.what(); +} + +void usearch_view_mem_lazy(usearch_index_t index, char* data, usearch_error_t* error) { + serialization_result_t result = reinterpret_cast(index)->view_mem_lazy(data); + if (!result) { + *error = result.error.what(); + // error needs to be reset. otherwise error_t destructor will raise. + // todo:: fix for the rest of the interface + result.error = nullptr; + } +} + +void usearch_update_header(usearch_index_t index, char* headerp, usearch_error_t* error) { + serialization_result_t result = reinterpret_cast(index)->update_header(headerp); + if (!result) { + *error = result.error.what(); + result.error = nullptr; + } +} + +usearch_metadata_t usearch_metadata(usearch_index_t index, usearch_error_t*) { + usearch_metadata_t res; + precomputed_constants_t pre = reinterpret_cast(index)->metadata(); + + res.inverse_log_connectivity = pre.inverse_log_connectivity; + res.connectivity_max_base = pre.connectivity_max_base; + res.neighbors_bytes = pre.neighbors_bytes; + res.neighbors_base_bytes = pre.neighbors_base_bytes; + return res; +} + USEARCH_EXPORT size_t usearch_size(usearch_index_t index, usearch_error_t*) { // return reinterpret_cast(index)->size(); } @@ -158,11 +196,28 @@ USEARCH_EXPORT void usearch_reserve(usearch_index_t index, size_t capacity, usea USEARCH_EXPORT void usearch_add( // usearch_index_t index, usearch_label_t label, void const* vector, usearch_scalar_kind_t kind, // usearch_error_t* error) { - add_result_t result = add_(reinterpret_cast(index), label, vector, to_native_scalar(kind)); + add_result_t result = add_(reinterpret_cast(index), label, vector, to_native_scalar(kind), -1, nullptr); + 
if (!result) + *error = result.error.what(); +} + +int32_t usearch_newnode_level(usearch_index_t index, usearch_error_t*) { + return reinterpret_cast(index)->newnode_level(); +} + +void usearch_add_external( // + usearch_index_t index, usearch_label_t label, void const* vector, void* tape, usearch_scalar_kind_t kind, // + int32_t level, usearch_error_t* error) { + add_result_t result = add_(reinterpret_cast(index), label, vector, to_native_scalar(kind), level, tape); if (!result) *error = result.error.what(); } +void usearch_set_node_retriever(usearch_index_t index, usearch_node_retriever_t retriever, + usearch_node_retriever_t retriever_mut, usearch_error_t*) { + reinterpret_cast(index)->set_node_retriever(retriever, retriever_mut); +} + #if USEARCH_LOOKUP_LABEL USEARCH_EXPORT bool usearch_contains(usearch_index_t index, usearch_label_t label, usearch_error_t*) { return reinterpret_cast(index)->contains(label); diff --git a/c/usearch.h b/c/usearch.h index 7d10876d..8dad5514 100644 --- a/c/usearch.h +++ b/c/usearch.h @@ -11,11 +11,15 @@ extern "C" { #include // `size_t` USEARCH_EXPORT typedef void* usearch_index_t; -USEARCH_EXPORT typedef uint32_t usearch_label_t; +// let this be larger, before I make it truly configurable +// lanterndb assumes this is at least 48 bits +// todo:: make this configurable +USEARCH_EXPORT typedef uint64_t usearch_label_t; USEARCH_EXPORT typedef float usearch_distance_t; USEARCH_EXPORT typedef char const* usearch_error_t; USEARCH_EXPORT typedef usearch_distance_t (*usearch_metric_t)(void const*, void const*); +USEARCH_EXPORT typedef void* (*usearch_node_retriever_t)(int index); USEARCH_EXPORT typedef enum usearch_metric_kind_t { usearch_metric_ip_k = 0, @@ -51,13 +55,24 @@ USEARCH_EXPORT typedef struct usearch_init_options_t { size_t expansion_search; } usearch_init_options_t; +USEARCH_EXPORT typedef struct { + double inverse_log_connectivity; + size_t connectivity_max_base; + size_t neighbors_bytes; + size_t neighbors_base_bytes; +} 
usearch_metadata_t; + USEARCH_EXPORT usearch_index_t usearch_init(usearch_init_options_t*, usearch_error_t*); USEARCH_EXPORT void usearch_free(usearch_index_t, usearch_error_t*); USEARCH_EXPORT void usearch_save(usearch_index_t, char const* path, usearch_error_t*); USEARCH_EXPORT void usearch_load(usearch_index_t, char const* path, usearch_error_t*); USEARCH_EXPORT void usearch_view(usearch_index_t, char const* path, usearch_error_t*); +USEARCH_EXPORT void usearch_view_mem(usearch_index_t index, char* data, usearch_error_t* error); +USEARCH_EXPORT void usearch_view_mem_lazy(usearch_index_t index, char* data, usearch_error_t* error); +USEARCH_EXPORT void usearch_update_header(usearch_index_t index, char* headerp, usearch_error_t* error); +USEARCH_EXPORT usearch_metadata_t usearch_metadata(usearch_index_t, usearch_error_t*); USEARCH_EXPORT size_t usearch_size(usearch_index_t, usearch_error_t*); USEARCH_EXPORT size_t usearch_capacity(usearch_index_t, usearch_error_t*); USEARCH_EXPORT size_t usearch_dimensions(usearch_index_t, usearch_error_t*); @@ -85,6 +100,14 @@ USEARCH_EXPORT bool usearch_get( // USEARCH_EXPORT void usearch_remove(usearch_index_t, usearch_label_t, usearch_error_t*); +USEARCH_EXPORT int32_t usearch_newnode_level(usearch_index_t index, usearch_error_t* error); + +USEARCH_EXPORT void usearch_set_node_retriever(usearch_index_t index, usearch_node_retriever_t retriever, + usearch_node_retriever_t retriever_mut, usearch_error_t* error); +USEARCH_EXPORT void usearch_add_external( // + usearch_index_t index, usearch_label_t label, void const* vector, void* tape, usearch_scalar_kind_t kind, // + int32_t level, usearch_error_t* error); + #ifdef __cplusplus } #endif diff --git a/include/usearch/index.hpp b/include/usearch/index.hpp index 11ade638..359e2cba 100644 --- a/include/usearch/index.hpp +++ b/include/usearch/index.hpp @@ -69,18 +69,19 @@ #endif // STL includes -#include // `std::sort_heap` -#include // `std::atomic` -#include // `std::bitset` 
-#include // `CHAR_BIT` -#include // `std::sqrt` -#include // `std::memset` -#include // `std::reverse_iterator` -#include // `std::unique_lock` - replacement candidate -#include // `std::default_random_engine` - replacement candidate -#include // `std::runtime_exception` -#include // `std::thread` -#include // `std::pair` +#include // `std::sort_heap` +#include // `std::atomic` +#include // `std::bitset` +#include // `CHAR_BIT` +#include // `std::sqrt` +#include // `std::memset` +#include // `std::function` +#include // `std::reverse_iterator` +#include // `std::unique_lock` - replacement candidate +#include // `std::default_random_engine` - replacement candidate +#include // `std::runtime_exception` +#include // `std::thread` +#include // `std::pair` // Prefetching #if defined(USEARCH_DEFINED_GCC) @@ -1208,6 +1209,12 @@ constexpr std::size_t default_allocator_entry_bytes() { return 64; } */ constexpr char const* default_magic() { return "usearch"; } +/** + * @brief How much larger (number of neighbors per node) will + * the base level be compared to other levels. + */ +constexpr std::size_t base_level_multiple() { return 2; } + /** * @brief Configuration settings for the index construction. 
* Includes the main `::connectivity` parameter (`M` in the paper) @@ -1229,6 +1236,24 @@ struct index_config_t { std::size_t vector_alignment = 1; }; +using neighbors_count_t = std::uint32_t; + +// brought this here so I could expose it from the punned class for external indexes +// probably better to include these in the file_head_result_t and expose it as part of metadata() +struct precomputed_constants_t { + double inverse_log_connectivity{}; + std::size_t connectivity_max_base{}; + std::size_t neighbors_bytes{}; + std::size_t neighbors_base_bytes{}; + precomputed_constants_t() {} + precomputed_constants_t(index_config_t const& config) noexcept + : inverse_log_connectivity(1.0 / std::log(static_cast(config.connectivity))), + connectivity_max_base(config.connectivity * base_level_multiple()), + neighbors_bytes(config.connectivity * sizeof(id_t) + sizeof(neighbors_count_t)), + // todo:: is it ok to use one init variable from another in this kind of initializers? + neighbors_base_bytes(connectivity_max_base * sizeof(id_t) + sizeof(neighbors_count_t)) {} +}; + struct index_limits_t { std::size_t members = 0; std::size_t threads_add = std::thread::hardware_concurrency(); @@ -1377,6 +1402,11 @@ struct file_head_result_t { size_t size; entry_idx_t entry_idx; + // Derived structural: + size_t connectivity_max_base; + size_t neighbors_bytes; + size_t neighbors_base_bytes; + // Additional: size_t bytes_for_graphs; size_t bytes_for_vectors; @@ -1644,7 +1674,6 @@ class index_gt { using reverse_const_iterator = std::reverse_iterator; private: - using neighbors_count_t = std::uint32_t; using dim_t = std::uint32_t; using level_t = std::int32_t; @@ -1657,12 +1686,6 @@ class index_gt { static_assert(sizeof(typename tape_allocator_traits_t::value_type) == 1, // "Tape allocator must allocate separate addressable bytes"); - /** - * @brief How much larger (number of neighbors per node) will - * the base level be compared to other levels. 
- */ - static constexpr std::size_t base_level_multiple_() { return 2; } - /** * @brief How many bytes of memory are needed to form the "head" of the node. */ @@ -1670,12 +1693,6 @@ class index_gt { using visits_bitset_t = visits_bitset_gt; - struct precomputed_constants_t { - double inverse_log_connectivity{}; - std::size_t connectivity_max_base{}; - std::size_t neighbors_bytes{}; - std::size_t neighbors_base_bytes{}; - }; struct candidate_t { distance_t distance; id_t id; @@ -1790,6 +1807,20 @@ class index_gt { node_t* nodes_{}; mutable visits_bitset_t nodes_mutexes_{}; + // used to retrieve nodes by id. If a custom retriever is not specified, this returns corresponding element + // from the nodes_ array. + // If custom retriever is specified, then the index is stored externally and the caller + // is responsible for providing an appropriate node retriever and storage facilities. + // the non-pointer node_with_id_ member function is still available as builtin_node_with_id_() + // todo:: uint32_t here should be id_t. change it once I figure out a way to enforce these in the c bindings + std::function node_with_id_ = [this](std::uint32_t idx) { return nodes_[idx]; }; + // same as above. Used for mutable access to nodes, in case the external storage treats immutable and mutable + // accesses differently. 
(E.g., LanternDB locks WAL blocks exclusively for mutable access and returns a reference + // but returns a copy on immutable access) + std::function node_with_id_mut_ = [this](std::uint32_t idx) { return nodes_[idx]; }; + bool custom_node_retriever_ = false; + bool debug_node_retriever_ = false; + using contexts_allocator_t = typename allocator_traits_t::template rebind_alloc; context_t* contexts_{}; @@ -1953,15 +1984,19 @@ class index_gt { return false; nodes_allocator_t node_allocator; - node_t* new_nodes = node_allocator.allocate(limits.members); - if (!new_nodes) - return false; + node_t* new_nodes = nullptr; + if (!custom_node_retriever_ || (custom_node_retriever_ && debug_node_retriever_)) { + new_nodes = node_allocator.allocate(limits.members); + if (!new_nodes) + return false; + } std::size_t limits_threads = limits.threads(); contexts_allocator_t context_allocator; context_t* new_contexts = context_allocator.allocate(limits_threads); if (!new_contexts) { - node_allocator.deallocate(new_nodes, limits.members); + if (new_nodes) + node_allocator.deallocate(new_nodes, limits.members); return false; } for (std::size_t i = 0; i != limits_threads; ++i) { @@ -1971,7 +2006,8 @@ class index_gt { if (!context.visits.resize(limits.members)) { for (std::size_t j = 0; j != i; ++j) context.visits.reset(); - node_allocator.deallocate(new_nodes, limits.members); + if (new_nodes) + node_allocator.deallocate(new_nodes, limits.members); context_allocator.deallocate(new_contexts, limits_threads); return false; } @@ -1989,8 +2025,8 @@ class index_gt { old_context.visits.reset(); } - // Move the nodes info, and deallocate previous buffers. - if (nodes_) + // Move the nodes info, and deallocate previous buffers when not using external storage. 
+ if (new_nodes && nodes_) std::memcpy(new_nodes, nodes_, sizeof(node_t) * size()), node_allocator.deallocate(nodes_, limits_.members); if (contexts_) context_allocator.deallocate(contexts_, limits_.threads()); @@ -2011,7 +2047,7 @@ class index_gt { std::size_t bytes_per_node_base = node_head_bytes_() + pre.neighbors_base_bytes; std::size_t rounded_size = divide_round_up<64>(bytes_per_node_base) * 64; std::size_t added_connections = (rounded_size - bytes_per_node_base) / sizeof(id_t); - config.connectivity = config.connectivity + added_connections / base_level_multiple_(); + config.connectivity = config.connectivity + added_connections / base_level_multiple(); return config; } @@ -2100,14 +2136,28 @@ class index_gt { } }; + /** + * @brief Generate a random level for a new externally stored vector. Thread-safe. + * + * @param[in] config Configuration options for this specific operation. + * @return The level to use for the new vector add operation. + */ + level_t choose_random_level(add_config_t config) { + context_t& context = contexts_[config.thread]; + return choose_random_level_(context.level_generator); + } + /** * @brief Inserts a new vector into the index. Thread-safe. * * @param[in] label External identifier/name/descriptor for the vector. * @param[in] vector Contiguous range of scalars forming a vector view. * @param[in] config Configuration options for this specific operation. + * @param[in] level The level to use for the new vector add operation (external storage only). + * @param[in] tape The tape to use for the new vector add operation (external storage only). 
*/ - add_result_t add(label_t label, vector_view_t vector, add_config_t config = {}) usearch_noexcept_m { + add_result_t add(label_t label, vector_view_t vector, add_config_t config = {}, level_t level = -1, + byte_t* tape = nullptr) usearch_noexcept_m { usearch_assert_m(!is_immutable(), "Can't add to an immutable index"); add_result_t result; @@ -2121,7 +2171,7 @@ class index_gt { // The top list needs one more slot than the connectivity of the base level // for the heuristic, that tries to squeeze one more element into saturated list. - std::size_t top_limit = (std::max)(base_level_multiple_() * config_.connectivity + 1, config.expansion); + std::size_t top_limit = (std::max)(base_level_multiple() * config_.connectivity + 1, config.expansion); if (!top.reserve(top_limit)) return result.failed("Out of memory!"); if (!next.reserve(config.expansion)) @@ -2131,17 +2181,30 @@ class index_gt { std::unique_lock new_level_lock(global_mutex_); level_t max_level_copy = max_level_; // Copy under lock id_t entry_id_copy = entry_id_; // Copy under lock - level_t target_level = choose_random_level_(context.level_generator); + usearch_assert_m(custom_node_retriever_ ^ (level == -1 && tape == nullptr), + "Must generate and specify level&tape iff nodes are externally stored"); + level_t target_level = level != -1 ? 
level : choose_random_level_(context.level_generator); if (target_level <= max_level_copy) new_level_lock.unlock(); - // Allocate the neighbors - node_t node = node_make_(label, vector, target_level, config.store_vector); - if (!node) - return result.failed("Out of memory!"); + // Allocate and initialize node tape (config, the neighbors and optionally the vector) + node_t node; + if (!custom_node_retriever_) { + node = node_make_(label, vector, target_level, config.store_vector); + if (!node) + return result.failed("Out of memory!"); + } else { + node_bytes_split_t node_bytes = node_view_(tape, vector.size() * config.store_vector, target_level); + node = node_init_(node_bytes, label, vector, target_level, config.store_vector); + if (!node) + return result.failed("Node init failed. Bad tape?"); + } std::size_t old_size = size_.fetch_add(1); id_t new_id = static_cast(old_size); - nodes_[old_size] = node; + if (!custom_node_retriever_) { + // node internally stored. save the node we just allocated + nodes_[old_size] = node; + } result.new_size = old_size + 1; result.id = new_id; node_lock_t new_lock = node_lock_(old_size); @@ -2165,6 +2228,7 @@ class index_gt { // Updating the entry point if needed if (target_level > max_level_copy) { + usearch_assert_m(new_level_lock.owns_lock(), "Must hold the new level lock when changing max level"); entry_id_ = new_id; max_level_ = target_level; } @@ -2193,7 +2257,7 @@ class index_gt { // The top list needs one more slot than the connectivity of the base level // for the heuristic, that tries to squeeze one more element into saturated list. 
- std::size_t top_limit = (std::max)(base_level_multiple_() * config_.connectivity + 1, config.expansion); + std::size_t top_limit = (std::max)(base_level_multiple() * config_.connectivity + 1, config.expansion); if (!top.reserve(top_limit)) return result.failed("Out of memory!"); if (!next.reserve(config.expansion)) @@ -2284,7 +2348,7 @@ class index_gt { result.nodes = size(); for (std::size_t i = 0; i != result.nodes; ++i) { node_t node = node_with_id_(i); - std::size_t max_edges = node.level() * config_.connectivity + base_level_multiple_() * config_.connectivity; + std::size_t max_edges = node.level() * config_.connectivity + base_level_multiple() * config_.connectivity; std::size_t edges = 0; for (level_t level = 0; level <= node.level(); ++level) edges += neighbors_(node, level).size(); @@ -2310,7 +2374,7 @@ class index_gt { result.allocated_bytes += node_head_bytes_() + node_vector_bytes_(node) + neighbors_bytes; } - std::size_t max_edges_per_node = level ? config_.connectivity : base_level_multiple_() * config_.connectivity; + std::size_t max_edges_per_node = level ? config_.connectivity : base_level_multiple() * config_.connectivity; result.max_edges = result.nodes * max_edges_per_node; return result; } @@ -2530,6 +2594,58 @@ class index_gt { return {}; } + serialization_result_t view_mem_lazy(byte_t* file) noexcept { + serialization_result_t result; + result = load_index_header(file); + if (result.error) { + // q:: this generates a warning "moving a local object in a return statement prevents copy elision" + // but if I do not move, it seems result is actually copied and the restructor of this function's copy + // raises when ~error_t is called. + // what is the right approach to make sure result is moved/constructed at destination? 
+ // I read that return std::move is bad practice so probably this is not the right approach + return std::move(result); + } + + if (!custom_node_retriever_) + return result.failed("custom node retriever must be set for lazy index loading"); +#ifdef DEBUG_RETRIEVER + return result.failed("for retriever debugging you must eagerly load the index into usearch with view_mem"); +#endif + return std::move(result); + } + + serialization_result_t update_header(byte_t* headerp) noexcept { + serialization_result_t result; + result = save_index_header(headerp); + return std::move(result); + } + + serialization_result_t view_mem(byte_t* file) noexcept { + serialization_result_t result; + // Parse and load the header + result = load_index_header(file); + if (result.error) { + return result; + } + + // Locate every node packed into file + std::size_t progress_bytes = sizeof(file_header_t); + std::size_t const size = size_; + for (std::size_t i = 0; i != size; ++i) { + byte_t* tape = (byte_t*)(file + progress_bytes); + dim_t dim = misaligned_load(tape + sizeof(label_t)); + level_t level = misaligned_load(tape + sizeof(label_t) + sizeof(dim_t)); + + std::size_t node_bytes = node_bytes_(dim, level); + std::size_t node_vector_bytes = dim * sizeof(scalar_t); + nodes_[i] = node_t{tape, (scalar_t*)(tape + node_bytes - node_vector_bytes)}; + progress_bytes += node_bytes; + // progress(i, size); + } + + return {}; + } + /** * @brief Memory-maps the serialized binary index representation from disk, * @b without copying the vectors and neighbors lists into RAM. 
@@ -2595,51 +2711,81 @@ class index_gt { viewed_file_.length = file_stat.st_size; #endif // Platform specific code - // Read the header - { - file_head_t state{file}; - if (state.bytes_per_label != sizeof(label_t)) { - reset_view_(); - return result.failed("Incompatible label type!"); - } - if (state.bytes_per_id != sizeof(id_t)) { - reset_view_(); - return result.failed("Incompatible ID type!"); - } + return view_mem(file); + } - config_.connectivity = state.connectivity; - config_.vector_alignment = state.vector_alignment; - pre_ = precompute_(config_); +#pragma endregion - index_limits_t limits; - limits.members = state.size; - limits.threads_add = 0; - if (!reserve(limits)) - return result.failed("Out of memory!"); +#pragma region External Index + using node_retriever_t = void* (*)(int index); + /** @brief The function sets the immutable and mutable getters for externally stored index nodes + * The caller must ensure the following properties for each kind of retriever: + * Immutable retriever: + * 1. The returned pointer must be valid until the corresponding usearch-API + * function (e.g., search, insert) returns. + * 2. The retriever function must handle multiple calls with the same index + * during the same high level API call. + * (as a result, the external retriever may not blindly lock the corresponding + * resource, for example, and first check whether the node is already locked + * and in accessed-cache) + * Mutable retriever: + * 1. Requirements of the immutable retriever + * 2. The retriever must ensure that when a node is modified, then retrieved again, + * the *modified* version is returned and not the committed version. 
+ * + * @param[in] external_node_retriever Pointer to external node retriever (the retriever returns + * a const pointer) + * @param[in] external_node_retriever_mut Pointer to external node retriever (the retriever returns + * a mutable pointer) + */ + void set_node_retriever(node_retriever_t external_node_retriever, + node_retriever_t external_node_retriever_mut) noexcept { + custom_node_retriever_ = true; +#ifdef DEBUG_RETRIEVER + debug_node_retriever_ = true; +#endif - size_ = state.size; - max_level_ = static_cast(state.max_level); - entry_id_ = static_cast(state.entry_idx); - } + node_with_id_ = [this, external_node_retriever](size_t index) { + byte_t* tape = (byte_t*)external_node_retriever(index); + dim_t dim = misaligned_load(tape + sizeof(label_t)); + level_t level = misaligned_load(tape + sizeof(label_t) + sizeof(dim_t)); - // Locate every node packed into file - std::size_t progress_bytes = sizeof(file_header_t); - std::size_t const size = size_; - for (std::size_t i = 0; i != size; ++i) { - byte_t* tape = (byte_t*)(file + progress_bytes); + std::size_t node_bytes = node_bytes_(dim, level); + std::size_t node_vector_bytes = dim * sizeof(scalar_t); + + node_t node = node_t{tape, (scalar_t*)(tape + node_bytes - node_vector_bytes)}; +#ifdef DEBUG_RETRIEVER + node_t correct_node = builtin_node_with_id_(index); + if (correct_node.tape() != node.tape()) { + std::cerr << "node retriever is incorrect. 
expected tape at addr " << (const void*)correct_node.tape() + << "but got " << (const void*)node.tape() << std::endl; + throw std::runtime_error("node retriever is incorrect"); + } +#endif + return node; + }; + node_with_id_mut_ = [this, external_node_retriever_mut](size_t index) { + byte_t* tape = (byte_t*)external_node_retriever_mut(index); dim_t dim = misaligned_load(tape + sizeof(label_t)); level_t level = misaligned_load(tape + sizeof(label_t) + sizeof(dim_t)); std::size_t node_bytes = node_bytes_(dim, level); std::size_t node_vector_bytes = dim * sizeof(scalar_t); - nodes_[i] = node_t{tape, (scalar_t*)(tape + node_bytes - node_vector_bytes)}; - progress_bytes += node_bytes; - progress(i, size); - } - return {}; + node_t node = node_t{tape, (scalar_t*)(tape + node_bytes - node_vector_bytes)}; +#ifdef DEBUG_RETRIEVER + node_t correct_node = builtin_node_with_id_(index); + if (correct_node.tape() != node.tape()) { + std::cerr << "node retriever is incorrect. expected tape at addr " << (const void*)correct_node.tape() + << "but got " << (const void*)node.tape() << std::endl; + throw std::runtime_error("node retriever is incorrect"); + } +#endif + return node; + }; } + precomputed_constants_t metadata() { return pre_; } #pragma endregion struct join_result_t { @@ -2920,7 +3066,7 @@ class index_gt { inline static precomputed_constants_t precompute_(index_config_t const& config) noexcept { precomputed_constants_t pre; - pre.connectivity_max_base = config.connectivity * base_level_multiple_(); + pre.connectivity_max_base = config.connectivity * base_level_multiple(); pre.inverse_log_connectivity = 1.0 / std::log(static_cast(config.connectivity)); pre.neighbors_bytes = config.connectivity * sizeof(id_t) + sizeof(neighbors_count_t); pre.neighbors_base_bytes = pre.connectivity_max_base * sizeof(id_t) + sizeof(neighbors_count_t); @@ -2955,19 +3101,30 @@ class index_gt { inline std::size_t node_vector_bytes_(node_t node) const noexcept { return 
node_vector_bytes_(node.dim()); } node_bytes_split_t node_malloc_(dim_t dims_to_store, level_t level) noexcept { - - std::size_t vector_bytes = node_vector_bytes_(dims_to_store); std::size_t node_bytes = node_bytes_(dims_to_store, level); - std::size_t non_vector_bytes = node_bytes - vector_bytes; - byte_t* data = (byte_t*)tape_allocator_.allocate(node_bytes); if (!data) return node_bytes_split_t{}; + return node_view_(data, dims_to_store, level); + } + + // if vector is stored external to this view, it is caller's responsibility to initialize + // node_bytes_split_t.vector{} before constructing a node{} on this + node_bytes_split_t node_view_(byte_t* data, dim_t dims_stored, level_t level) noexcept { + std::size_t vector_bytes = node_vector_bytes_(dims_stored); + std::size_t node_bytes = node_bytes_(dims_stored, level); + std::size_t non_vector_bytes = node_bytes - vector_bytes; + return {{data, non_vector_bytes}, {data + non_vector_bytes, vector_bytes}}; } node_t node_make_(label_t label, vector_view_t vector, level_t level, bool store_vector) noexcept { node_bytes_split_t node_bytes = node_malloc_(vector.size() * store_vector, level); + return node_init_(node_bytes, label, vector, level, store_vector); + } + + node_t node_init_(node_bytes_split_t node_bytes, label_t label, vector_view_t vector, level_t level, + bool store_vector) { if (store_vector) { std::memset(node_bytes.tape.data(), 0, node_bytes.tape.size()); std::memcpy(node_bytes.vector.data(), vector.data(), node_bytes.vector.size()); @@ -2979,7 +3136,7 @@ class index_gt { node.dim(static_cast(vector.size())); node.level(level); return node; - } + }; node_t node_make_copy_(node_bytes_split_t old_bytes) noexcept { if (old_bytes.colocated()) { @@ -2999,6 +3156,10 @@ class index_gt { if (viewed_file_) return; + // for view_mem and view_mem_lazy where file is provided by the caller so there are no nodes + // managed by us to free + if (!nodes_) + return; node_t& node = nodes_[id]; std::size_t node_bytes = 
node_bytes_(node) - node_vector_bytes_(node) * !node_bytes_split_(node).colocated(); @@ -3006,7 +3167,7 @@ class index_gt { node = node_t{}; } - inline node_t node_with_id_(std::size_t idx) const noexcept { return nodes_[idx]; } + inline node_t builtin_node_with_id_(std::size_t idx) const noexcept { return nodes_[idx]; } inline neighbors_ref_t neighbors_base_(node_t node) const noexcept { return {node.neighbors_tape()}; } inline neighbors_ref_t neighbors_non_base_(node_t node, level_t level) const noexcept { @@ -3077,8 +3238,8 @@ class index_gt { // Reverse links from the neighbors: std::size_t const connectivity_max = level ? config_.connectivity : pre_.connectivity_max_base; for (id_t close_id : new_neighbors) { - node_t close_node = node_with_id_(close_id); - node_lock_t close_lock = node_lock_(close_id); + node_t close_node = node_with_id_mut_(close_id); + node_lock_t close_lock = node_lock_(close_id); //--<< remove on external inserts todo:: neighbors_ref_t close_header = neighbors_(close_node, level); usearch_assert_m(close_header.size() <= connectivity_max, "Possible corruption"); @@ -3095,10 +3256,13 @@ class index_gt { // To fit a new connection we need to drop an existing one. top.clear(); usearch_assert_m((top.reserve(close_header.size() + 1)), "The memory must have been reserved in `add`"); + // insert the newly added node to the competition for neighbors of close_node top.insert_reserved({context.measure(new_node, close_node), new_id}); + // add the existing neighbors of close_node to the competition for (id_t successor_id : close_header) top.insert_reserved({context.measure(node_with_id_(successor_id), close_node), successor_id}); + // select the closest ones and update close_node neighbors. 
No other node is changed from here on // Export the results: close_header.clear(); candidates_view_t top_view = refine_(top, connectivity_max, context); @@ -3319,6 +3483,61 @@ class index_gt { top.shrink(submitted_count); return {top_data, submitted_count}; } + + serialization_result_t load_index_header(byte_t* file) { + serialization_result_t result; + file_head_t state{file}; + if (state.bytes_per_label != sizeof(label_t)) { + reset_view_(); + return result.failed("Incompatible label type!"); + } + if (state.bytes_per_id != sizeof(id_t)) { + reset_view_(); + return result.failed("Incompatible ID type!"); + } + if (std::strncmp(state.magic, default_magic(), std::strlen(default_magic())) != 0) + return result.failed("Wrong MIME type!"); + + config_.connectivity = state.connectivity; + config_.vector_alignment = state.vector_alignment; + pre_ = precomputed_constants_t(config_); + + index_limits_t limits; + limits.members = state.size; + limits.threads_add = 0; + if (!reserve(limits)) + return result.failed("Out of memory!"); + + size_ = state.size; + max_level_ = static_cast(state.max_level); + entry_id_ = static_cast(state.entry_idx); + return result; + } + + serialization_result_t save_index_header(byte_t* headerp) { + // todo:: make sure that the index is not in use or is properly locked + // when generating the header + serialization_result_t result; + file_head_t state{headerp}; + if (state.bytes_per_label != sizeof(label_t)) { + reset_view_(); + return result.failed("Incompatible label type!"); + } + if (state.bytes_per_id != sizeof(id_t)) { + reset_view_(); + return result.failed("Incompatible ID type!"); + } + if (std::strncmp(state.magic, default_magic(), std::strlen(default_magic())) != 0) + return result.failed("Wrong MIME type!"); + + state.connectivity = config_.connectivity; + state.vector_alignment = config_.vector_alignment; + + state.size = size_; + state.max_level = max_level_; + state.entry_idx = entry_id_; + return result; + } }; /** @@ -3344,6 
+3563,12 @@ inline file_head_result_t index_metadata(char const* file_path) noexcept { if (std::strncmp(state.magic, default_magic(), std::strlen(default_magic())) != 0) return result.failed("Wrong MIME type!"); + // add precomputed constants to the resulting metadata + index_config_t config; + config.connectivity = state.connectivity; + config.vector_alignment = state.vector_alignment; + precomputed_constants_t pre = precomputed_constants_t(config); + result.version_major = state.version_major; result.version_minor = state.version_minor; result.version_patch = state.version_patch; @@ -3359,6 +3584,10 @@ inline file_head_result_t index_metadata(char const* file_path) noexcept { result.bytes_for_graphs = state.bytes_for_graphs; result.bytes_for_vectors = state.bytes_for_vectors; result.bytes_checksum = state.bytes_checksum; + + result.connectivity_max_base = pre.connectivity_max_base; + result.neighbors_base_bytes = pre.neighbors_base_bytes; + result.neighbors_bytes = pre.neighbors_bytes; return result; } diff --git a/include/usearch/index_punned_dense.hpp b/include/usearch/index_punned_dense.hpp index 782cd36c..90abc5bf 100644 --- a/include/usearch/index_punned_dense.hpp +++ b/include/usearch/index_punned_dense.hpp @@ -201,6 +201,7 @@ class index_punned_dense_gt { using join_result_t = typename index_t::join_result_t; using stats_t = typename index_t::stats_t; using match_t = typename index_t::match_t; + using node_retriever_t = typename index_t::node_retriever_t; index_punned_dense_gt() = default; index_punned_dense_gt(index_punned_dense_gt&& other) @@ -339,6 +340,7 @@ class index_punned_dense_gt { stats_t stats() const { return typed_->stats(); } stats_t stats(std::size_t level) const { return typed_->stats(level); } + precomputed_constants_t metadata() const { return typed_->metadata(); } std::size_t memory_usage() const { return typed_->memory_usage(0) + // @@ -350,7 +352,7 @@ class index_punned_dense_gt { add_result_t add(label_t label, b1x8_t const* vector) { 
return add_(label, vector, casts_.from_b1x8); } add_result_t add(label_t label, f8_bits_t const* vector) { return add_(label, vector, casts_.from_f8); } add_result_t add(label_t label, f16_t const* vector) { return add_(label, vector, casts_.from_f16); } - add_result_t add(label_t label, f32_t const* vector) { return add_(label, vector, casts_.from_f32); } + add_result_t add(label_t label, f32_t const* vector, int32_t level = -1, byte_t *tape = nullptr) { return add_(label, vector, casts_.from_f32, level, tape); } add_result_t add(label_t label, f64_t const* vector) { return add_(label, vector, casts_.from_f64); } add_result_t add(label_t label, b1x8_t const* vector, add_config_t config) { return add_(label, vector, config, casts_.from_b1x8); } @@ -444,6 +446,33 @@ class index_punned_dense_gt { return result; } + serialization_result_t view_mem(char* memory) { + serialization_result_t result = typed_->view_mem(memory); +#if USEARCH_LOOKUP_LABEL + if (result) + reindex_labels_(); +#endif + return result; + } + + serialization_result_t view_mem_lazy(char* memory) { + serialization_result_t result; +#if USEARCH_LOOKUP_LABEL + return result.failed("Usearch does not support label lookup and member removals for external memory indexes."); +#endif + result = typed_->view_mem_lazy(memory); + return std::move(result); + } + + serialization_result_t update_header(char* headerp) { + serialization_result_t result = typed_->update_header(headerp); + return result; + } + + void set_node_retriever(node_retriever_t node_retriever, node_retriever_t node_retriever_mut) { + typed_->set_node_retriever(node_retriever, node_retriever_mut); + } + #if USEARCH_LOOKUP_LABEL /** * @brief Checks if a vector with specified label is present. 
@@ -581,6 +610,12 @@ class index_punned_dense_gt { *labels = it->first; } #endif + int32_t newnode_level() { + thread_lock_t lock = thread_lock_(); + add_config_t add_config; + add_config.thread = lock.thread_id; + return typed_->choose_random_level(add_config); + } /** * @brief Adapts the Male-Optimal Stable Marriage algorithm for unequal sets @@ -736,7 +771,8 @@ class index_punned_dense_gt { } template - add_result_t add_(label_t label, scalar_at const* vector, add_config_t config, cast_t const& cast) { + add_result_t add_(label_t label, scalar_at const* vector, add_config_t config, cast_t const& cast, + int32_t level = -1, byte_t* tape = nullptr) { byte_t const* vector_data = reinterpret_cast(vector); std::size_t vector_bytes = dimensions_ * sizeof(scalar_at); @@ -758,7 +794,7 @@ class index_punned_dense_gt { add_result_t result = // free_id != default_free_value() // ? typed_->update(free_id, label, {vector_data, vector_bytes}, config) - : typed_->add(label, {vector_data, vector_bytes}, config); + : typed_->add(label, {vector_data, vector_bytes}, config, level, tape); #if USEARCH_LOOKUP_LABEL { unique_lock_t lock(labeled_lookup_mutex_); @@ -841,11 +877,13 @@ class index_punned_dense_gt { } #endif - template add_result_t add_(label_t label, scalar_at const* vector, cast_t const& cast) { + template + add_result_t add_(label_t label, scalar_at const* vector, cast_t const& cast, int32_t level = -1, + byte_t* tape = nullptr) { thread_lock_t lock = thread_lock_(); add_config_t add_config; add_config.thread = lock.thread_id; - return add_(label, vector, add_config, cast); + return add_(label, vector, add_config, cast, level, tape); } template