Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions conanfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

class HomestoreConan(ConanFile):
name = "homestore"
version = "6.20.30"
version = "6.20.31"

homepage = "https://github.com/eBay/Homestore"
description = "HomeStore Storage Engine"
Expand Down Expand Up @@ -53,7 +53,7 @@ def build_requirements(self):

def requirements(self):
self.requires("iomgr/[^11.3]@oss/master", transitive_headers=True)
self.requires("sisl/[^12.4.7]@oss/master", transitive_headers=True)
self.requires("sisl/[^12.4]@oss/master", transitive_headers=True)
self.requires("nuraft_mesg/[~3.8.7]@oss/main", transitive_headers=True)

self.requires("farmhash/cci.20190513@", transitive_headers=True)
Expand Down
2 changes: 1 addition & 1 deletion src/include/homestore/btree/detail/btree_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ class BtreeNode : public sisl::ObjLifeCounter< BtreeNode > {
auto prevKey = get_nth_key< K >(i - 1, false);
auto curKey = get_nth_key< K >(i, false);
if (prevKey.compare(curKey) >= 0) {
DEBUG_ASSERT(false, "Order check failed at entry={}", i);
DEBUG_ASSERT(false, "Order check failed at entry={}, btree id={}", i, node_id());
return false;
}
}
Expand Down
109 changes: 30 additions & 79 deletions src/include/homestore/index/index_table.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -618,19 +618,24 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
K last_child_last_key;
K last_child_neighbor_key;
BtreeNodePtr cur_child = child_node;
auto sibling_first_child = true_sibling_first_child(parent_node);
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);

// We find the last child node by starting from the leftmost child and traversing through the
// next_bnode links until we reach the end or find a sibling first child.
bool found_child = false;
auto sibling_first_child = true_sibling_first_child(parent_node);
LOGTRACEMOD(wbcache, "Sibling first child id is {}", sibling_first_child);
while (cur_child != nullptr) {
LOGTRACEMOD(wbcache, "Processing child node [{}]", cur_child->to_string());
if (!cur_child->is_node_deleted() && cur_child->total_entries() > 0) {
last_child_last_key = cur_child->get_last_key< K >();
found_child = true;
}

if (child_node->total_entries() == 0 && orig_child_infos.contains(child_node->node_id())) {
last_child_last_key = orig_child_infos[child_node->node_id()];
found_child = true;
}

next_cur_child = nullptr;
if (cur_child->next_bnode() == empty_bnodeid ||
read_node_impl(cur_child->next_bnode(), next_cur_child) != btree_status_t::success) {
Expand Down Expand Up @@ -685,7 +690,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
LOGTRACEMOD(wbcache,
"No undeleted child found for parent_node [{}], keep normal repair (regular recovery)",
parent_node->to_string());
next_cur_child = nullptr;
}
}
}
Expand All @@ -700,6 +704,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
// Walk across all child nodes until it gets the last_parent_key and keep fixing them.
auto cur_parent = parent_node;
BtreeNodeList new_parent_nodes;
BtreeNodeList child_nodes_to_free;
do {
if (child_node->has_valid_edge() || (child_node->is_leaf() && child_node->next_bnode() == empty_bnodeid)) {
LOGTRACEMOD(wbcache, "Child node [{}] is an edge node or a leaf with no next", child_node->to_string());
Expand Down Expand Up @@ -760,61 +765,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
LOGTRACEMOD(wbcache, "Repairing node={}, child_node=[{}] child_last_key={}", cur_parent->node_id(),
child_node->to_string(), child_last_key.to_string());

// Check if we are beyond the last child node.
//
// There can be cases where the child level merge is successfully persisted but the parent level is
// not. In this case, you may have your rightmost child node with last key greater than the
// last_parent_key. That's why here we have to check if the child node is one of the original child
// nodes first.
if (!is_parent_edge_node && !orig_child_infos.contains(child_node->node_id())) {
LOGTRACEMOD(
wbcache,
"Child node [{}] is not one of the original child nodes, so we need to check if it is beyond the "
"last parent key {}",
child_node->to_string(), last_parent_key.to_string());
if (child_last_key.compare(last_parent_key) > 0) {
// We have reached a child beyond this parent, we can stop now
// TODO this case if child last key is less than last parent key to update the parent node.
// this case can potentially break the btree for put and remove op.
break;
}
if (child_node->total_entries() == 0) {
// this child has no entries, but maybe in the middle of the parent node, we need to update the key
// of parent as previous one and go on
LOGTRACEMOD(wbcache,
"Reach to an empty child node {}, and this child doesn't belong to this parent; Hence "
"loop ends",
child_node->to_string());
// now update the next of parent node by skipping all deleted siblings of this parent node
auto valid_sibling = cur_parent->next_bnode();
while (valid_sibling != empty_bnodeid) {
BtreeNodePtr sibling;
if (read_node_impl(valid_sibling, sibling) == btree_status_t::success) {
if (sibling->is_node_deleted()) {
valid_sibling = sibling->next_bnode();
continue;
}
// cur_parent->set_next_bnode(sibling->node_id());
break;
}
LOGTRACEMOD(wbcache, "Failed to read child node {} for parent node [{}] reason {}",
valid_sibling, cur_parent->to_string(), ret);
}
if (valid_sibling != empty_bnodeid) {
cur_parent->set_next_bnode(valid_sibling);
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
cur_parent->to_string(), child_node->to_string());

} else {
cur_parent->set_next_bnode(empty_bnodeid);
LOGTRACEMOD(wbcache, "Repairing node=[{}], child_node=[{}] is an edge node, end loop",
cur_parent->to_string(), child_node->to_string());
}

break;
}
}

if (!cur_parent->has_room_for_put(btree_put_type::INSERT, K::get_max_size(),
BtreeLinkInfo::get_fixed_size())) {
// No room in the parent_node, let us split the parent_node and continue
Expand All @@ -836,22 +786,25 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
cur_parent = std::move(new_parent);
}

// If the child node is empty, we mark it as deleted and remove them after the repair loop
if (child_node->total_entries() == 0) {
LOGTRACEMOD(wbcache, "Found an empty child node {}, marking it deleted",
child_node->to_string());
child_node->set_node_deleted();
child_nodes_to_free.push_back(child_node);
// the links to the previous child node will be fixed in the next code block
}


// Insert the last key of the child node into parent node
if (!child_node->is_node_deleted()) {
if (child_node->total_entries() == 0) {
if (orig_child_infos.contains(child_node->node_id())) {
child_last_key = orig_child_infos[child_node->node_id()];
LOGTRACEMOD(wbcache,
"Reach to an empty child node [{}], but not the end of the parent node, so we need "
"to update the key of parent node as original one {}",
child_node->to_string(), child_last_key.to_string());
} else {
LOGTRACEMOD(wbcache,
"Reach to an empty child node [{}] but not belonging to this parent (probably next "
"parent sibling); Hence end loop",
child_node->to_string());
break;
}
BT_LOG_ASSERT(false,
"Child node [{}] is empty but not deleted and not an edge, while it doesn't "
"belong to this parent node {}",
child_node->to_string(), parent_node->to_string());
ret = btree_status_t::not_found;
break;
}
cur_parent->insert(cur_parent->total_entries(), child_last_key,
BtreeLinkInfo{child_node->node_id(), child_node->link_version()});
Expand All @@ -873,13 +826,6 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
IndexBtreeNode* idx_node = static_cast< IndexBtreeNode* >(pre_child_node.get());
idx_node->m_idx_buf->set_state(index_buf_state_t::CLEAN);
write_node_impl(pre_child_node, cp_ctx);
// update the key of last entry of the parent with the last key of deleted child
child_last_key = orig_child_infos[child_node->node_id()];
LOGTRACEMOD(wbcache, "updating parent [{}] current last key with {}", cur_parent->to_string(),
child_last_key.to_string());
// update it here to go to the next child node and unlock this node
LOGTRACEMOD(wbcache, "update the child node next to the next of previous child node");
child_node->set_next_bnode(child_node->next_bnode());
}
}

Expand Down Expand Up @@ -914,6 +860,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
child_node = nullptr;
break;
}

ret = this->read_and_lock_node(next_node_id, child_node, locktype_t::READ, locktype_t::READ, cp_ctx);
if (ret != btree_status_t::success) {
BT_LOG_ASSERT(false, "Parent node={} repair is partial, because child_node get has failed with ret={}",
Expand All @@ -925,6 +872,10 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
} while (true);

if (child_node) { this->unlock_node(child_node, locktype_t::READ); }

// free the deleted child nodes
for (const auto& cnode : child_nodes_to_free) { free_node_impl(cnode, cp_ctx); }

// if last parent has the key less than the last child key, then we need to update the parent node with
// the last child key if it doesn't have edge.
auto last_parent = parent_node;
Expand Down Expand Up @@ -996,7 +947,7 @@ class IndexTable : public IndexTableBase, public Btree< K, V > {
}

if (sibling_node->is_node_deleted()) {
LOGTRACEMOD(wbcache, "Sibling node [{}] is not the sibling for parent_node {}",
LOGTRACEMOD(wbcache, "Sibling node [{}] is not the true sibling for parent_node {}",
sibling_node->to_string(), node->to_string());
return find_true_sibling(sibling_node);
} else {
Expand Down
3 changes: 2 additions & 1 deletion src/lib/index/wb_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ IndexWBCache::IndexWBCache(const std::shared_ptr< VirtualDev >& vdev, std::pair<
},
[](const sisl::CacheRecord& rec) -> bool {
const auto& hnode = (sisl::SingleEntryHashNode< BtreeNodePtr >&)rec;
return static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf->is_clean();
auto idx_buf = static_cast< IndexBtreeNode* >(hnode.m_value.get())->m_idx_buf;
return !idx_buf || idx_buf->is_clean();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would you share under which use case idx_buf will be nulllptr?
Shouldn't cache always be the last one to free this buf when evicted?
I think what we need here is probably an debug assert that idx_buf is not nullptr? Or is there a case that I am not aware of?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The cache is a little different to 1.3. Here, the cache holds a shared ptr to the btree node and the index buffer is a member of the btree node. The idx buffer of a btree node can be freed or copied into a new buffer and the previous idx buffer can become invalid

Copy link
Copy Markdown
Contributor

@yamingk yamingk Sep 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy to a new buffer is okay as long as it is still valid. When (and who) will free a btree nodes's index buffer with a cache record still pointing to this node?
Would you also help me understand in which case we need to copy a index buffer and invalidate the original idx buff?

}},
m_node_size{node_size},
m_meta_blk{sb.first} {
Expand Down
13 changes: 10 additions & 3 deletions src/tests/btree_helpers/btree_test_helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ struct BtreeTestHelper {
m_max_range_input = SISL_OPTIONS["num_entries"].as< uint32_t >();

if (m_is_multi_threaded) {
m_fibers.clear();
std::mutex mtx;
m_run_time = SISL_OPTIONS["run_time"].as< uint32_t >();
iomanager.run_on_wait(iomgr::reactor_regex::all_worker, [this, &mtx]() {
Expand Down Expand Up @@ -328,6 +329,10 @@ struct BtreeTestHelper {
do_range_remove(start_k, end_k, false /* removing_all_existing */);
}

void range_remove_all(uint32_t start_k, uint32_t end_k, bool expect_success = true) {
do_range_remove(start_k, end_k, true /* removing_all_existing */, expect_success);
}

////////////////////// All query operation variants ///////////////////////////////
void query_all() { do_query(0u, SISL_OPTIONS["num_entries"].as< uint32_t >() - 1, UINT32_MAX); }

Expand Down Expand Up @@ -527,7 +532,7 @@ struct BtreeTestHelper {
if (expect_success) { m_shadow_map.put_and_check(key, value, *existing_v, done); }
}

void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing) {
void do_range_remove(uint64_t start_k, uint64_t end_k, bool all_existing, bool expect_success = true) {
K start_key = K{start_k};
K end_key = K{end_k};

Expand All @@ -537,8 +542,10 @@ struct BtreeTestHelper {

if (all_existing) {
m_shadow_map.range_erase(start_key, end_key);
ASSERT_EQ((ret == btree_status_t::success), true)
<< "not a successful remove op for range " << start_k << "-" << end_k;
if (expect_success) {
ASSERT_EQ(ret, btree_status_t::success)
<< "not a successful remove op for range " << start_k << "-" << end_k;
}
} else if (start_k < m_max_range_input) {
K end_range{std::min(end_k, uint64_cast(m_max_range_input - 1))};
m_shadow_map.range_erase(start_key, end_range);
Expand Down
2 changes: 1 addition & 1 deletion src/tests/btree_helpers/btree_test_kvs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class TestFixedKey : public BtreeKey {
public:
TestFixedKey() = default;
TestFixedKey(uint64_t k) : m_key{k} {}
TestFixedKey(const TestFixedKey& other) : TestFixedKey(other.serialize(), true) {}
TestFixedKey(const TestFixedKey& other) : m_key{other.m_key} {}
TestFixedKey(const BtreeKey& other) : TestFixedKey(other.serialize(), true) {}
TestFixedKey(const sisl::blob& b, bool copy) : BtreeKey(), m_key{*(r_cast< const uint64_t* >(b.cbytes()))} {}
TestFixedKey& operator=(const TestFixedKey& other) = default;
Expand Down
14 changes: 14 additions & 0 deletions src/tests/btree_helpers/shadow_map.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ class ShadowMap {
public:
ShadowMap(uint32_t num_keys) : m_range_scheduler(num_keys), m_max_keys{num_keys} {}

using pick_existing_range_cb_t = std::function< void(const K&) >;
void put_and_check(const K& key, const V& val, const V& old_val, bool expected_success) {
std::lock_guard lock{m_mutex};
auto const [it, happened] = m_map.insert(std::make_pair(key, val));
Expand Down Expand Up @@ -69,6 +70,19 @@ class ShadowMap {
return std::pair(start_it->first, it->first);
}

std::optional<std::pair< K, K > > pick_existing_range(const K& start_key, uint32_t max_count, pick_existing_range_cb_t cb) const {
std::lock_guard lock{m_mutex};
auto const start_it = m_map.lower_bound(start_key);
if (start_it == m_map.cend()) { return std::nullopt; }
auto end_it = start_it;
auto it = start_it;
for(uint32_t count = 0; count < max_count && it != m_map.cend(); ++count, ++it) {
cb(it->first);
end_it = it;
}
return std::pair(start_it->first, end_it->first);
}

uint32_t max_keys() const { return m_max_keys; }

bool exists(const K& key) const {
Expand Down
39 changes: 38 additions & 1 deletion src/tests/test_common/homestore_test_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <device/hs_super_blk.h>
#include <iomgr/iomgr_config_generated.h>
#include <common/homestore_assert.hpp>
#include <iomgr/http_server.hpp>

#ifdef _PRERELEASE
#include "common/crash_simulator.hpp"
Expand Down Expand Up @@ -70,6 +71,34 @@ using namespace homestore;

namespace test_common {

class test_http_server {
public:
void get_prometheus_metrics(const Pistache::Rest::Request&, Pistache::Http::ResponseWriter response) {
response.send(Pistache::Http::Code::Ok,
sisl::MetricsFarm::getInstance().report(sisl::ReportFormat::kTextFormat));
}

void start() {
auto http_server_ptr = ioenvironment.get_http_server();

std::vector< iomgr::http_route > routes = {
{Pistache::Http::Method::Get, "/metrics",
Pistache::Rest::Routes::bind(&test_http_server::get_prometheus_metrics, this), iomgr::url_t::safe}};
try {
http_server_ptr->setup_routes(routes);
LOGINFO("Started http server ");
} catch (std::runtime_error const& e) { LOGERROR("setup routes failed, {}", e.what()) }

// start the server
http_server_ptr->start();
}

void stop() {
auto http_server_ptr = ioenvironment.get_http_server();
http_server_ptr->stop();
}
};

// Fix a port for http server
inline static void set_fixed_http_port(uint32_t http_port) {
SETTINGS_FACTORY(iomgr_config).modifiable_settings([http_port](auto& s) { s.io_env->http_port = http_port; });
Expand Down Expand Up @@ -362,6 +391,12 @@ class HSTestHelper {
}
}

test_http_server* get_http_server() {
return m_http_server.get();
}

void set_app_mem_size(uint64_t app_mem_size) { m_app_mem_size = app_mem_size; }

private:
void do_start_homestore(bool fake_restart = false, bool init_device = true, uint32_t shutdown_delay_sec = 5) {
auto const ndevices = SISL_OPTIONS["num_devs"].as< uint32_t >();
Expand Down Expand Up @@ -429,7 +464,7 @@ class HSTestHelper {
ioenvironment.with_http_server();
}

const uint64_t app_mem_size = ((ndevices * dev_size) * 15) / 100;
const uint64_t app_mem_size = (m_app_mem_size == 0) ? (((ndevices * dev_size) * 15) / 100) : m_app_mem_size;
LOGINFO("Initialize and start HomeStore with app_mem_size = {}", homestore::in_bytes(app_mem_size));

using namespace homestore;
Expand Down Expand Up @@ -530,6 +565,8 @@ class HSTestHelper {
protected:
test_token m_token;
std::vector< std::string > m_generated_devs;
std::unique_ptr< test_http_server > m_http_server;
uint64_t m_app_mem_size{0};
#ifdef _PRERELEASE
flip::FlipClient m_fc{iomgr_flip::instance()};
folly::Promise< folly::Unit > m_crash_recovered;
Expand Down
Loading
Loading