From ad8c58870d9f9c046aff0d66b335445a64c64416 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 19 Nov 2025 14:20:59 +0000 Subject: [PATCH 01/41] Fix some bugs --- src/cluster/ClusterCommunicationService.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/src/cluster/ClusterCommunicationService.cpp b/src/cluster/ClusterCommunicationService.cpp index a0aba56b9c..2543baaded 100644 --- a/src/cluster/ClusterCommunicationService.cpp +++ b/src/cluster/ClusterCommunicationService.cpp @@ -150,7 +150,10 @@ ClioNode ClusterCommunicationService::selfData() const { ClioNode result{}; - util::spawn(strand_, [this, &result](boost::asio::yield_context) { result = selfData_; }); + boost::asio::spawn( + strand_, [this, &result](boost::asio::yield_context) { result = selfData_; }, boost::asio::use_future + ) + .wait(); return result; } @@ -161,10 +164,15 @@ ClusterCommunicationService::clusterData() const return std::unexpected{"Service is not healthy"}; } std::vector result; - util::spawn(strand_, [this, &result](boost::asio::yield_context) { - result = otherNodesData_; - result.push_back(selfData_); - }); + boost::asio::spawn( + strand_, + [this, &result](boost::asio::yield_context) { + result = otherNodesData_; + result.push_back(selfData_); + }, + boost::asio::use_future + ) + .wait(); return result; } @@ -220,7 +228,8 @@ ClusterCommunicationService::doWrite() { selfData_.updateTime = std::chrono::system_clock::now(); boost::json::value jsonValue{}; - boost::json::value_from(selfData_, jsonValue); + auto const& selfDataRef = selfData_; + boost::json::value_from(selfDataRef, jsonValue); backend_->writeNodeMessage(*selfData_.uuid, boost::json::serialize(jsonValue.as_object())); } From db86ff9a41e8ccf82e7d905d7e6c8152c9717bbf Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 20 Nov 2025 16:46:25 +0000 Subject: [PATCH 02/41] Change flags in etl SystemState --- src/etl/ETLService.cpp | 13 +++++++++---- src/etl/SystemState.hpp | 5 +++-- src/etl/impl/Loading.cpp | 2 +- tests/unit/etl/ETLServiceTests.cpp | 18 ++++++++++++------ 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 6f8c0d4ee6..743f76bdb1 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -351,11 +351,14 @@ ETLService::startMonitor(uint32_t seq) monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - if (state_->writeConflict) { - LOG(log_.info()) << "Got a write conflict; Giving up writer seat immediately"; + if (state_->shouldGiveUpWriter) { giveUpWriter(); } + if (state_->shouldTakeoverWriting) { + attemptTakeoverWriter(); + } + if (not state_->isWriting) { auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { return backend_->fetchLedgerDiff(seq, yield); @@ -371,7 +374,7 @@ ETLService::startMonitor(uint32_t seq) monitorDbStalledSubscription_ = monitor_->subscribeToDbStalled([this]() { LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor"; if (not state_->isStrictReadonly and not state_->isWriting) - attemptTakeoverWriter(); + state_->shouldTakeoverWriting = true; }); monitor_->run(); @@ -395,6 +398,7 @@ ETLService::attemptTakeoverWriter() ASSERT(rng.has_value(), "Ledger range can't be null"); state_->isWriting = true; // switch to writer + state_->shouldTakeoverWriting = false; LOG(log_.info()) << "Taking over the ETL writer seat"; startLoading(rng->maxSequence + 1); } @@ -404,7 +408,8 @@ ETLService::giveUpWriter() { ASSERT(not state_->isStrictReadonly, "This should only happen on writer nodes"); state_->isWriting = false; - state_->writeConflict = false; + state_->shouldGiveUpWriter = false; + LOG(log_.info()) << "Giving up writer seat"; taskMan_ = nullptr; } diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 7f841665f4..8de89e75a3 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -50,8 +50,9 @@ struct SystemState { "Whether the process is writing to the database" ); - std::atomic_bool isStopping = false; /**< @brief Whether the software is stopping. */ - std::atomic_bool writeConflict = false; /**< @brief Whether a write conflict was detected. */ + std::atomic_bool isStopping = false; /**< @brief Whether the software is stopping. */ + std::atomic_bool shouldTakeoverWriting = false; /**< @brief Whether ETL should start writing to DB. */ + std::atomic_bool shouldGiveUpWriter = false; /**< @brief Whether ETL should stop writing to DB. */ /** * @brief Whether clio detected an amendment block. diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index 59d2d0a9c7..f27cc64f37 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -75,7 +75,7 @@ Loader::load(model::LedgerData const& data) << "; took " << duration << "ms"; if (not success) { - state_->writeConflict = true; + state_->shouldGiveUpWriter = true; LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; return std::unexpected(LoaderError::WriteConflict); } diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 253009459c..e458a6332d 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -370,13 +370,13 @@ TEST_F(ETLServiceTests, HandlesWriteConflictInMonitorSubscription) EXPECT_CALL(*cacheLoader_, load(kSEQ)); service_.run(); - systemState_->writeConflict = true; + systemState_->shouldGiveUpWriter = true; EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->writeConflict); + EXPECT_FALSE(systemState_->shouldGiveUpWriter); EXPECT_FALSE(systemState_->isWriting); } @@ -424,7 +424,11 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) return std::move(mockMonitor); }); - EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + std::function onNewSeqCallback; + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&onNewSeqCallback](auto cb) { + onNewSeqCallback = std::move(cb); + return boost::signals2::scoped_connection{}; + }); EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { capturedDbStalledCallback = callback; return boost::signals2::scoped_connection{}; @@ -450,6 +454,8 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) ASSERT_TRUE(capturedDbStalledCallback); capturedDbStalledCallback(); + EXPECT_FALSE(systemState_->isWriting); // will attempt to become writer after new sequence appears but not yet + onNewSeqCallback(kSEQ); EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer } @@ -477,15 +483,15 @@ TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) service_.run(); systemState_->isWriting = true; - systemState_->writeConflict = true; // got a write conflict along the way + systemState_->shouldGiveUpWriter = true; // got a write conflict along the way EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->isWriting); // gives up writing - EXPECT_FALSE(systemState_->writeConflict); // and removes write conflict flag + EXPECT_FALSE(systemState_->isWriting); // gives up writing + EXPECT_FALSE(systemState_->shouldGiveUpWriter); // and removes write conflict flag } TEST_F(ETLServiceTests, CancelledLoadInitialLedger) From 1b00f2842e7691d002245405f9813ec4e97c5421 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 21 Nov 2025 12:17:59 +0000 Subject: [PATCH 03/41] Add db role to ClioNode --- src/cluster/ClioNode.cpp | 13 +++- src/cluster/ClioNode.hpp | 10 +-- src/cluster/ClusterCommunicationService.hpp | 4 +- tests/unit/cluster/ClioNodeTests.cpp | 80 +++++++++++++++++---- 4 files changed, 84 insertions(+), 23 deletions(-) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index e28585a905..1315cffdba 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,7 @@ namespace { struct Fields { static constexpr std::string_view const kUPDATE_TIME = "update_time"; + static constexpr std::string_view const kDB_ROLE = "db_role"; }; } // namespace @@ -46,6 +48,7 @@ tag_invoke(boost::json::value_from_tag, boost::json::value& jv, ClioNode const& { jv = { {Fields::kUPDATE_TIME, util::systemTpToUtcStr(node.updateTime, ClioNode::kTIME_FORMAT)}, + {Fields::kDB_ROLE, static_cast(node.dbRole)} }; } @@ -58,7 +61,15 @@ tag_invoke(boost::json::value_to_tag, boost::json::value const& jv) throw std::runtime_error("Failed to parse update time"); } - return ClioNode{.uuid = std::make_shared(), .updateTime = updateTime.value()}; + auto const dbRoleValue = jv.as_object().at(Fields::kDB_ROLE).as_int64(); + if (dbRoleValue > static_cast(ClioNode::DbRole::MAX)) + throw std::runtime_error("Invalid db_role value"); + + return ClioNode{ + .uuid = std::make_shared(), + .updateTime = updateTime.value(), + .dbRole = static_cast(dbRoleValue) + }; } } // namespace cluster diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index a350a37156..3afcdeadb0 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -37,16 +37,12 @@ struct ClioNode { */ static constexpr char const* kTIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"; - // enum class WriterRole { - // ReadOnly, - // NotWriter, - // Writer - // }; + /** @brief Database role */ + enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, MAX = 2 }; std::shared_ptr uuid; ///< The UUID of the node. std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. - - // WriterRole writerRole; + DbRole dbRole; ///< The database role of the node }; void diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index 3814271e6d..75d0207de7 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -77,8 +77,8 @@ class ClusterCommunicationService : public ClusterCommunicationServiceInterface std::vector otherNodesData_; public: - static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{2100}; - static constexpr std::chrono::milliseconds kDEFAULT_WRITE_INTERVAL{1200}; + static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{1000}; + static constexpr std::chrono::milliseconds kDEFAULT_WRITE_INTERVAL{1000}; /** * @brief Construct a new Cluster Communication Service object. * diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index 6cfe9c9242..f15f4fa61b 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -18,6 +18,7 @@ //============================================================================== #include "cluster/ClioNode.hpp" +#include "util/NameGenerator.hpp" #include "util/TimeUtils.hpp" #include @@ -29,6 +30,7 @@ #include #include +#include #include #include #include @@ -44,44 +46,44 @@ struct ClioNodeTest : testing::Test { TEST_F(ClioNodeTest, Serialization) { - // Create a ClioNode with test data ClioNode const node{ - .uuid = std::make_shared(boost::uuids::random_generator()()), .updateTime = updateTime + .uuid = std::make_shared(boost::uuids::random_generator()()), + .updateTime = updateTime, + .dbRole = ClioNode::DbRole::Writer }; - // Serialize to JSON boost::json::value jsonValue; EXPECT_NO_THROW(boost::json::value_from(node, jsonValue)); - // Verify JSON structure ASSERT_TRUE(jsonValue.is_object()) << jsonValue; auto const& obj = jsonValue.as_object(); - // Check update_time exists and is a string EXPECT_TRUE(obj.contains("update_time")); EXPECT_TRUE(obj.at("update_time").is_string()); + + EXPECT_TRUE(obj.contains("db_role")); + EXPECT_TRUE(obj.at("db_role").is_number()); + EXPECT_EQ(obj.at("db_role").as_int64(), static_cast(node.dbRole)); } TEST_F(ClioNodeTest, Deserialization) { - boost::json::value const jsonValue = {{"update_time", updateTimeStr}}; + boost::json::value const jsonValue = {{"update_time", updateTimeStr}, {"db_role", 1}}; - // Deserialize to ClioNode - ClioNode node{.uuid = std::make_shared(), .updateTime = {}}; - EXPECT_NO_THROW(node = boost::json::value_to(jsonValue)); + ClioNode node{ + .uuid = std::make_shared(), .updateTime = {}, .dbRole = ClioNode::DbRole::ReadOnly + }; + ASSERT_NO_THROW(node = boost::json::value_to(jsonValue)); - // Verify deserialized data EXPECT_NE(node.uuid, nullptr); EXPECT_EQ(*node.uuid, boost::uuids::uuid{}); EXPECT_EQ(node.updateTime, updateTime); + EXPECT_EQ(node.dbRole, ClioNode::DbRole::NotWriter); } TEST_F(ClioNodeTest, DeserializationInvalidTime) { - // Prepare an invalid time format boost::json::value const jsonValue{"update_time", "invalid_format"}; - - // Expect an exception during deserialization EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); } @@ -93,3 +95,55 @@ TEST_F(ClioNodeTest, DeserializationMissingTime) // Expect an exception EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); } + +struct ClioNodeDbRoleTestBundle { + std::string testName; + ClioNode::DbRole role; +}; + +struct ClioNodeDbRoleTest : ClioNodeTest, testing::WithParamInterface {}; + +INSTANTIATE_TEST_SUITE_P( + AllDbRoles, + ClioNodeDbRoleTest, + testing::Values( + ClioNodeDbRoleTestBundle{.testName = "ReadOnly", .role = ClioNode::DbRole::ReadOnly}, + ClioNodeDbRoleTestBundle{.testName = "NotWriter", .role = ClioNode::DbRole::NotWriter}, + ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer} + ), + tests::util::kNAME_GENERATOR +); + +TEST_P(ClioNodeDbRoleTest, Serialization) +{ + auto const param = GetParam(); + ClioNode const node{ + .uuid = std::make_shared(boost::uuids::random_generator()()), + .updateTime = updateTime, + .dbRole = param.role + }; + auto const jsonValue = boost::json::value_from(node); + EXPECT_EQ(jsonValue.as_object().at("db_role").as_int64(), static_cast(param.role)); +} + +TEST_P(ClioNodeDbRoleTest, Deserialization) +{ + auto const param = GetParam(); + boost::json::value const jsonValue = { + {"update_time", updateTimeStr}, {"db_role", static_cast(param.role)} + }; + auto const node = boost::json::value_to(jsonValue); + EXPECT_EQ(node.dbRole, param.role); +} + +TEST_F(ClioNodeDbRoleTest, DeserializationInvalidDbRole) +{ + boost::json::value const jsonValue = {{"update_time", updateTimeStr}, {"db_role", 10}}; + EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); +} + +TEST_F(ClioNodeDbRoleTest, DeserializationMissingDbRole) +{ + boost::json::value const jsonValue = {{"update_time", updateTimeStr}}; + EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); +} From a32177a0c824afa44fd905daf403a9a59f045cb3 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Mon, 24 Nov 2025 16:02:36 +0000 Subject: [PATCH 04/41] Add WriterState --- src/cluster/ClioNode.hpp | 5 +- src/etl/CMakeLists.txt | 1 + src/etl/WriterState.cpp | 57 +++++++++++++++ src/etl/WriterState.hpp | 109 ++++++++++++++++++++++++++++ tests/unit/CMakeLists.txt | 1 + tests/unit/etl/WriterStateTests.cpp | 82 +++++++++++++++++++++ 6 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 src/etl/WriterState.cpp create mode 100644 src/etl/WriterState.hpp create mode 100644 tests/unit/etl/WriterStateTests.cpp diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index 3afcdeadb0..fe7f0fabbc 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -40,7 +40,10 @@ struct ClioNode { /** @brief Database role */ enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, MAX = 2 }; - std::shared_ptr uuid; ///< The UUID of the node. + using UUID = std::shared_ptr; + using cUUID = std::shared_ptr; + + UUID uuid; ///< The UUID of the node. std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. DbRole dbRole; ///< The database role of the node }; diff --git a/src/etl/CMakeLists.txt b/src/etl/CMakeLists.txt index d6e8557db0..466b0cec13 100644 --- a/src/etl/CMakeLists.txt +++ b/src/etl/CMakeLists.txt @@ -11,6 +11,7 @@ target_sources( NetworkValidatedLedgers.cpp NFTHelpers.cpp Source.cpp + WriterState.cpp impl/AmendmentBlockHandler.cpp impl/AsyncGrpcCall.cpp impl/Extraction.cpp diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp new file mode 100644 index 0000000000..0602a2d9ca --- /dev/null +++ b/src/etl/WriterState.cpp @@ -0,0 +1,57 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/WriterState.hpp" + +#include "etl/SystemState.hpp" + +#include +#include + +namespace etl { + +WriterState::WriterState(std::shared_ptr state) : systemState_(std::move(state)) +{ +} + +bool +WriterState::isWriting() const +{ + return systemState_->isWriting; +} + +void +WriterState::startWriting() +{ + if (isWriting()) + return; + + systemState_->shouldTakeoverWriting = true; +} + +void +WriterState::giveUpWriting() +{ + if (not isWriting()) + return; + + systemState_->shouldTakeoverWriting = true; +} + +} // namespace etl diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp new file mode 100644 index 0000000000..c75c6a0082 --- /dev/null +++ b/src/etl/WriterState.hpp @@ -0,0 +1,109 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "etl/SystemState.hpp" + +#include + +namespace etl { + +/** + * @brief Interface for managing writer state in the ETL subsystem. + * + * This interface provides methods to query and control whether the ETL process + * is actively writing to the database. Implementations should coordinate with + * the ETL system state to manage write responsibilities. + */ +class WriterStateInterface { +public: + virtual ~WriterStateInterface() = default; + + /** + * @brief Check if the ETL process is currently writing to the database. + * @return true if the process is writing, false otherwise + */ + [[nodiscard]] virtual bool + isWriting() const = 0; + + /** + * @brief Request to start writing to the database. + * + * This method signals that the process should take over writing responsibilities. + * The actual transition to writing state may not be immediate. + */ + virtual void + startWriting() = 0; + + /** + * @brief Request to stop writing to the database. + * + * This method signals that the process should give up writing responsibilities. + * The actual transition from writing state may not be immediate. + */ + virtual void + giveUpWriting() = 0; +}; + +/** + * @brief Implementation of WriterStateInterface that manages ETL writer state. + * + * This class coordinates with SystemState to manage whether the ETL process + * is actively writing to the database. It provides methods to query the current + * writing state and request transitions between writing and non-writing states. + */ +class WriterState : public WriterStateInterface { +private: + std::shared_ptr systemState_; /**< @brief Shared system state for ETL coordination */ + +public: + /** + * @brief Construct a WriterState with the given system state. + * @param state Shared pointer to the system state for coordination + */ + WriterState(std::shared_ptr state); + + /** + * @brief Check if the ETL process is currently writing to the database. + * @return true if the process is writing, false otherwise + */ + bool + isWriting() const override; + + /** + * @brief Request to start writing to the database. + * + * If already writing, this method does nothing. Otherwise, it sets the + * shouldTakeoverWriting flag in the system state to signal the request. + */ + void + startWriting() override; + + /** + * @brief Request to stop writing to the database. + * + * If not currently writing, this method does nothing. Otherwise, it sets the + * shouldTakeoverWriting flag in the system state to signal the request. + */ + void + giveUpWriting() override; +}; + +} // namespace etl diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index cff464b7ac..4d1dfd08e7 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -47,6 +47,7 @@ target_sources( etl/SourceImplTests.cpp etl/SubscriptionSourceTests.cpp etl/TaskManagerTests.cpp + etl/WriterStateTests.cpp etl/ext/CoreTests.cpp etl/ext/CacheTests.cpp etl/ext/MPTTests.cpp diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp new file mode 100644 index 0000000000..62ae06a1ed --- /dev/null +++ b/tests/unit/etl/WriterStateTests.cpp @@ -0,0 +1,82 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/SystemState.hpp" +#include "etl/WriterState.hpp" +#include "util/MockPrometheus.hpp" + +#include + +#include + +using namespace etl; + +struct WriterStateTest : util::prometheus::WithPrometheus { + std::shared_ptr systemState = std::make_shared(); + WriterState writerState{systemState}; +}; + +TEST_F(WriterStateTest, IsWritingReturnsSystemStateValue) +{ + systemState->isWriting = false; + EXPECT_FALSE(writerState.isWriting()); + + systemState->isWriting = true; + EXPECT_TRUE(writerState.isWriting()); +} + +TEST_F(WriterStateTest, StartWritingSetsFlag) +{ + systemState->isWriting = false; + systemState->shouldTakeoverWriting = false; + + writerState.startWriting(); + + EXPECT_TRUE(systemState->shouldTakeoverWriting); +} + +TEST_F(WriterStateTest, StartWritingDoesNothingWhenAlreadyWriting) +{ + systemState->isWriting = true; + systemState->shouldTakeoverWriting = false; + + writerState.startWriting(); + + EXPECT_FALSE(systemState->shouldTakeoverWriting); +} + +TEST_F(WriterStateTest, GiveUpWritingSetsFlag) +{ + systemState->isWriting = true; + systemState->shouldTakeoverWriting = false; + + writerState.giveUpWriting(); + + EXPECT_TRUE(systemState->shouldTakeoverWriting); +} + +TEST_F(WriterStateTest, GiveUpWritingDoesNothingWhenNotWriting) +{ + systemState->isWriting = false; + systemState->shouldTakeoverWriting = false; + + writerState.giveUpWriting(); + + EXPECT_FALSE(systemState->shouldTakeoverWriting); +} From 49c3eb46d87ef411a1c74572849f3b3e9f767237 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 25 Nov 2025 17:02:32 +0000 Subject: [PATCH 05/41] Refactored ClusterCommunicationService --- src/app/ClioApplication.cpp | 12 +- src/cluster/Backend.cpp | 136 ++++++++++++++ src/cluster/Backend.hpp | 105 +++++++++++ src/cluster/CMakeLists.txt | 2 +- src/cluster/ClioNode.cpp | 16 ++ src/cluster/ClioNode.hpp | 5 + src/cluster/ClusterCommunicationService.cpp | 171 +----------------- src/cluster/ClusterCommunicationService.hpp | 73 ++------ .../ClusterCommunicationServiceInterface.hpp | 54 ------ src/cluster/impl/RepeatedTask.hpp | 107 +++++++++++ src/etl/ETLService.cpp | 4 +- src/etl/ETLService.hpp | 1 + src/etl/SystemState.hpp | 11 ++ src/etl/WriterState.cpp | 6 + src/etl/WriterState.hpp | 6 + .../ClusterCommunicationServiceTests.cpp | 2 + 16 files changed, 425 insertions(+), 286 deletions(-) create mode 100644 src/cluster/Backend.cpp create mode 100644 src/cluster/Backend.hpp delete mode 100644 src/cluster/ClusterCommunicationServiceInterface.hpp create mode 100644 src/cluster/impl/RepeatedTask.hpp diff --git a/src/app/ClioApplication.cpp b/src/app/ClioApplication.cpp index 23cc4dff74..4dbfb0c265 100644 --- a/src/app/ClioApplication.cpp +++ b/src/app/ClioApplication.cpp @@ -29,6 +29,8 @@ #include "etl/ETLService.hpp" #include "etl/LoadBalancer.hpp" #include "etl/NetworkValidatedLedgers.hpp" +#include "etl/SystemState.hpp" +#include "etl/WriterState.hpp" #include "feed/SubscriptionManager.hpp" #include "migration/MigrationInspectorFactory.hpp" #include "rpc/Counters.hpp" @@ -120,7 +122,11 @@ ClioApplication::run(bool const useNgWebServer) // Interface to the database auto backend = data::makeBackend(config_, cache); - cluster::ClusterCommunicationService clusterCommunicationService{backend}; + auto systemState = etl::SystemState::makeSystemState(config_); + + cluster::ClusterCommunicationService clusterCommunicationService{ + backend, std::make_unique(systemState) + }; clusterCommunicationService.run(); auto const amendmentCenter = std::make_shared(backend); @@ -150,7 +156,9 @@ ClioApplication::run(bool const useNgWebServer) ); // ETL is responsible for writing and publishing to streams. In read-only mode, ETL only publishes - auto etl = etl::ETLService::makeETLService(config_, ctx, backend, subscriptions, balancer, ledgers); + auto etl = etl::ETLService::makeETLService( + config_, std::move(systemState), ctx, backend, subscriptions, balancer, ledgers + ); auto workQueue = rpc::WorkQueue::makeWorkQueue(config_); auto counters = rpc::Counters::makeCounters(workQueue); diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp new file mode 100644 index 0000000000..bd1211dceb --- /dev/null +++ b/src/cluster/Backend.cpp @@ -0,0 +1,136 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" + +#include "cluster/ClioNode.hpp" +#include "data/BackendInterface.hpp" +#include "etl/WriterState.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cluster { + +Backend::Backend( + boost::asio::thread_pool::executor_type ctx, + std::shared_ptr backend, + std::unique_ptr writerState, + std::chrono::steady_clock::duration readInterval, + std::chrono::steady_clock::duration writeInterval +) + : backend_(std::move(backend)) + , writerState_(std::move(writerState)) + , readerTask_(readInterval, ctx) + , writerTask_(writeInterval, ctx) + , selfUuid_(std::make_shared(boost::uuids::random_generator{}())) +{ +} + +void +Backend::run() +{ + readerTask_.run([this](boost::asio::yield_context yield) { + auto clusterState = doRead(yield); + onNewState_(selfUuid_, std::move(clusterState)); + }); + + writerTask_.run([this]() { doWrite(); }); +} + +Backend::~Backend() +{ + stop(); +} + +void +Backend::stop() +{ + readerTask_.stop(); + writerTask_.stop(); +} + +std::expected const>, std::string> +Backend::doRead(boost::asio::yield_context yield) +{ + BackendInterface::ClioNodesDataFetchResult expectedResult; + try { + expectedResult = backend_->fetchClioNodesData(yield); + } catch (...) { + expectedResult = std::unexpected{"Failed to fetch Clio nodes data"}; + } + + if (!expectedResult.has_value()) { + return std::unexpected{"Failed to fetch nodes data"}; + } + + std::vector otherNodesData; + for (auto const& [uuid, nodeDataStr] : expectedResult.value()) { + if (uuid == *selfUuid_) { + continue; + } + + boost::system::error_code errorCode; + auto const json = boost::json::parse(nodeDataStr, errorCode); + if (errorCode.failed()) { + return std::unexpected{fmt::format("Error parsing json from DB: {}", nodeDataStr)}; + } + + auto expectedNodeData = boost::json::try_value_to(json); + if (expectedNodeData.has_error()) { + return std::unexpected{fmt::format("Error converting json to ClioNode: {}", nodeDataStr)}; + } + *expectedNodeData->uuid = uuid; + otherNodesData.push_back(std::move(expectedNodeData).value()); + } + otherNodesData.push_back(ClioNode::from(selfUuid_, *writerState_)); + return std::make_shared>(otherNodesData); +} + +void +Backend::doWrite() +{ + auto const selfData = ClioNode::from(selfUuid_, *writerState_); + boost::json::value jsonValue{}; + boost::json::value_from(selfData, jsonValue); + backend_->writeNodeMessage(*selfData.uuid, boost::json::serialize(jsonValue.as_object())); +} + +} // namespace cluster diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp new file mode 100644 index 0000000000..855606fe21 --- /dev/null +++ b/src/cluster/Backend.hpp @@ -0,0 +1,105 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/ClioNode.hpp" +#include "cluster/impl/RepeatedTask.hpp" +#include "data/BackendInterface.hpp" +#include "etl/WriterState.hpp" +#include "util/log/Logger.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace cluster { + +class Backend { +public: + using ClusterData = std::expected const>, std::string>; + +private: + util::Logger log_{"ClusterCommunication"}; + + std::shared_ptr backend_; + std::unique_ptr writerState_; + + impl::RepeatedTask readerTask_; + impl::RepeatedTask writerTask_; + + ClioNode::UUID selfUuid_; + + boost::signals2::signal onNewState_; + +public: + Backend( + boost::asio::thread_pool::executor_type ctx, + std::shared_ptr backend, + std::unique_ptr writerState, + std::chrono::steady_clock::duration readInterval, + std::chrono::steady_clock::duration writeInterval + ); + + ~Backend(); + + Backend(Backend&&) = delete; + Backend& + operator=(Backend&&) = delete; + Backend(Backend const&) = delete; + Backend& + operator=(Backend const&) = delete; + + void + run(); + + void + stop(); + + template + requires std::invocable + boost::signals2::connection + subscribeToNewState(S&& s) + { + return onNewState_.connect(s); + } + +private: + std::expected const>, std::string> + doRead(boost::asio::yield_context yield); + + void + doWrite(); +}; + +} // namespace cluster diff --git a/src/cluster/CMakeLists.txt b/src/cluster/CMakeLists.txt index defd5853ec..724c195e7e 100644 --- a/src/cluster/CMakeLists.txt +++ b/src/cluster/CMakeLists.txt @@ -1,5 +1,5 @@ add_library(clio_cluster) -target_sources(clio_cluster PRIVATE ClioNode.cpp ClusterCommunicationService.cpp) +target_sources(clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp) target_link_libraries(clio_cluster PRIVATE clio_util clio_data) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index 1315cffdba..cf16b6d4a8 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -19,6 +19,7 @@ #include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" #include "util/TimeUtils.hpp" #include @@ -26,11 +27,13 @@ #include #include +#include #include #include #include #include #include +#include namespace cluster { @@ -43,6 +46,19 @@ struct Fields { } // namespace +ClioNode +ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState) +{ + auto const dbRole = [&writerState]() { + if (writerState.isReadOnly()) { + return ClioNode::DbRole::ReadOnly; + } + + return writerState.isWriting() ? ClioNode::DbRole::Writer : ClioNode::DbRole::NotWriter; + }(); + return ClioNode{.uuid = std::move(uuid), .updateTime = std::chrono::system_clock::now(), .dbRole = dbRole}; +} + void tag_invoke(boost::json::value_from_tag, boost::json::value& jv, ClioNode const& node) { diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index fe7f0fabbc..220b63a3e2 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -19,6 +19,8 @@ #pragma once +#include "etl/WriterState.hpp" + #include #include #include @@ -46,6 +48,9 @@ struct ClioNode { UUID uuid; ///< The UUID of the node. std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. DbRole dbRole; ///< The database role of the node + + static ClioNode + from(UUID uuid, etl::WriterStateInterface const& writerState); }; void diff --git a/src/cluster/ClusterCommunicationService.cpp b/src/cluster/ClusterCommunicationService.cpp index 2543baaded..0a428d2e6d 100644 --- a/src/cluster/ClusterCommunicationService.cpp +++ b/src/cluster/ClusterCommunicationService.cpp @@ -19,11 +19,8 @@ #include "cluster/ClusterCommunicationService.hpp" -#include "cluster/ClioNode.hpp" #include "data/BackendInterface.hpp" -#include "util/Assert.hpp" -#include "util/Spawn.hpp" -#include "util/log/Logger.hpp" +#include "etl/WriterState.hpp" #include #include @@ -41,31 +38,18 @@ #include #include -#include #include -#include #include -#include - -namespace { -constexpr auto kTOTAL_WORKERS = 2uz; // 1 reading and 1 writing worker (coroutines) -} // namespace namespace cluster { ClusterCommunicationService::ClusterCommunicationService( std::shared_ptr backend, + std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval, std::chrono::steady_clock::duration writeInterval ) - : backend_(std::move(backend)) - , readInterval_(readInterval) - , writeInterval_(writeInterval) - , finishedCountdown_(kTOTAL_WORKERS) - , selfData_{ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator{}()), - .updateTime = std::chrono::system_clock::time_point{} - }} + : backend_(ctx_.executor(), std::move(backend), std::move(writerState), readInterval, writeInterval) { nodesInClusterMetric_.set(1); // The node always sees itself isHealthy_ = true; @@ -74,43 +58,7 @@ ClusterCommunicationService::ClusterCommunicationService( void ClusterCommunicationService::run() { - ASSERT(not running_ and not stopped_, "Can only be ran once"); - running_ = true; - - util::spawn(strand_, [this](boost::asio::yield_context yield) { - boost::asio::steady_timer timer(yield.get_executor()); - boost::system::error_code ec; - - while (running_) { - timer.expires_after(readInterval_); - auto token = cancelSignal_.slot(); - timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); - - if (ec == boost::asio::error::operation_aborted or not running_) - break; - - doRead(yield); - } - - finishedCountdown_.count_down(1); - }); - - util::spawn(strand_, [this](boost::asio::yield_context yield) { - boost::asio::steady_timer timer(yield.get_executor()); - boost::system::error_code ec; - - while (running_) { - doWrite(); - timer.expires_after(writeInterval_); - auto token = cancelSignal_.slot(); - timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); - - if (ec == boost::asio::error::operation_aborted or not running_) - break; - } - - finishedCountdown_.count_down(1); - }); + backend_.run(); } ClusterCommunicationService::~ClusterCommunicationService() @@ -121,116 +69,7 @@ ClusterCommunicationService::~ClusterCommunicationService() void ClusterCommunicationService::stop() { - if (stopped_) - return; - - stopped_ = true; - - // for ASAN to see through concurrency correctly we need to exit all coroutines before joining the ctx - running_ = false; - - // cancelSignal_ is not thread safe so we execute emit on the same strand - boost::asio::spawn( - strand_, [this](auto&&) { cancelSignal_.emit(boost::asio::cancellation_type::all); }, boost::asio::use_future - ) - .wait(); - finishedCountdown_.wait(); - - ctx_.join(); -} - -std::shared_ptr -ClusterCommunicationService::selfUuid() const -{ - // Uuid never changes so it is safe to copy it without using strand_ - return selfData_.uuid; -} - -ClioNode -ClusterCommunicationService::selfData() const -{ - ClioNode result{}; - boost::asio::spawn( - strand_, [this, &result](boost::asio::yield_context) { result = selfData_; }, boost::asio::use_future - ) - .wait(); - return result; -} - -std::expected, std::string> -ClusterCommunicationService::clusterData() const -{ - if (not isHealthy_) { - return std::unexpected{"Service is not healthy"}; - } - std::vector result; - boost::asio::spawn( - strand_, - [this, &result](boost::asio::yield_context) { - result = otherNodesData_; - result.push_back(selfData_); - }, - boost::asio::use_future - ) - .wait(); - return result; -} - -void -ClusterCommunicationService::doRead(boost::asio::yield_context yield) -{ - otherNodesData_.clear(); - - BackendInterface::ClioNodesDataFetchResult expectedResult; - try { - expectedResult = backend_->fetchClioNodesData(yield); - } catch (...) { - expectedResult = std::unexpected{"Failed to fecth Clio nodes data"}; - } - - if (!expectedResult.has_value()) { - LOG(log_.error()) << "Failed to fetch nodes data"; - isHealthy_ = false; - return; - } - - // Create a new vector here to not have partially parsed data in otherNodesData_ - std::vector otherNodesData; - for (auto const& [uuid, nodeDataStr] : expectedResult.value()) { - if (uuid == *selfData_.uuid) { - continue; - } - - boost::system::error_code errorCode; - auto const json = boost::json::parse(nodeDataStr, errorCode); - if (errorCode.failed()) { - LOG(log_.error()) << "Error parsing json from DB: " << nodeDataStr; - isHealthy_ = false; - return; - } - - auto expectedNodeData = boost::json::try_value_to(json); - if (expectedNodeData.has_error()) { - LOG(log_.error()) << "Error converting json to ClioNode: " << json; - isHealthy_ = false; - return; - } - *expectedNodeData->uuid = uuid; - otherNodesData.push_back(std::move(expectedNodeData).value()); - } - otherNodesData_ = std::move(otherNodesData); - nodesInClusterMetric_.set(otherNodesData_.size() + 1); - isHealthy_ = true; -} - -void -ClusterCommunicationService::doWrite() -{ - selfData_.updateTime = std::chrono::system_clock::now(); - boost::json::value jsonValue{}; - auto const& selfDataRef = selfData_; - boost::json::value_from(selfDataRef, jsonValue); - backend_->writeNodeMessage(*selfData_.uuid, boost::json::serialize(jsonValue.as_object())); + backend_.stop(); } } // namespace cluster diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index 75d0207de7..899e403705 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -19,10 +19,9 @@ #pragma once -#include "cluster/ClioNode.hpp" -#include "cluster/ClusterCommunicationServiceInterface.hpp" +#include "cluster/Backend.hpp" #include "data/BackendInterface.hpp" -#include "util/log/Logger.hpp" +#include "etl/WriterState.hpp" #include "util/prometheus/Bool.hpp" #include "util/prometheus/Gauge.hpp" #include "util/prometheus/Prometheus.hpp" @@ -33,19 +32,16 @@ #include #include -#include #include -#include #include #include -#include namespace cluster { /** * @brief Service to post and read messages to/from the cluster. It uses a backend to communicate with the cluster. */ -class ClusterCommunicationService : public ClusterCommunicationServiceInterface { +class ClusterCommunicationService { util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( "cluster_nodes_total_number", {}, @@ -59,52 +55,28 @@ class ClusterCommunicationService : public ClusterCommunicationServiceInterface // TODO: Use util::async::CoroExecutionContext after https://github.com/XRPLF/clio/issues/1973 is implemented boost::asio::thread_pool ctx_{1}; - boost::asio::strand strand_ = boost::asio::make_strand(ctx_); - - util::Logger log_{"ClusterCommunication"}; - - std::shared_ptr backend_; - - std::chrono::steady_clock::duration readInterval_; - std::chrono::steady_clock::duration writeInterval_; - - boost::asio::cancellation_signal cancelSignal_; - std::latch finishedCountdown_; - std::atomic_bool running_ = false; - bool stopped_ = false; - - ClioNode selfData_; - std::vector otherNodesData_; + Backend backend_; public: static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{1000}; static constexpr std::chrono::milliseconds kDEFAULT_WRITE_INTERVAL{1000}; + /** * @brief Construct a new Cluster Communication Service object. * * @param backend The backend to use for communication. + * @param writerState The state showing whether clio is writing to the database. * @param readInterval The interval to read messages from the cluster. * @param writeInterval The interval to write messages to the cluster. */ ClusterCommunicationService( std::shared_ptr backend, + std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval = kDEFAULT_READ_INTERVAL, std::chrono::steady_clock::duration writeInterval = kDEFAULT_WRITE_INTERVAL ); - ~ClusterCommunicationService() override; - - /** - * @brief Start the service. - */ - void - run(); - - /** - * @brief Stop the service. - */ - void - stop(); + ~ClusterCommunicationService(); ClusterCommunicationService(ClusterCommunicationService&&) = delete; ClusterCommunicationService(ClusterCommunicationService const&) = delete; @@ -114,35 +86,16 @@ class ClusterCommunicationService : public ClusterCommunicationServiceInterface operator=(ClusterCommunicationService const&) = delete; /** - * @brief Get the UUID of the current node. - * - * @return The UUID of the current node. - */ - std::shared_ptr - selfUuid() const; - - /** - * @brief Get the data of the current node. - * - * @return The data of the current node. + * @brief Start the service. */ - ClioNode - selfData() const override; + void + run(); /** - * @brief Get the data of all nodes in the cluster (including self). - * - * @return The data of all nodes in the cluster or error if the service is not healthy. + * @brief Stop the service. */ - std::expected, std::string> - clusterData() const override; - -private: - void - doRead(boost::asio::yield_context yield); - void - doWrite(); + stop(); }; } // namespace cluster diff --git a/src/cluster/ClusterCommunicationServiceInterface.hpp b/src/cluster/ClusterCommunicationServiceInterface.hpp deleted file mode 100644 index 6e79460c58..0000000000 --- a/src/cluster/ClusterCommunicationServiceInterface.hpp +++ /dev/null @@ -1,54 +0,0 @@ -//------------------------------------------------------------------------------ -/* - This file is part of clio: https://github.com/XRPLF/clio - Copyright (c) 2025, the clio developers. - - Permission to use, copy, modify, and distribute this software for any - purpose with or without fee is hereby granted, provided that the above - copyright notice and this permission notice appear in all copies. - - THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. -*/ -//============================================================================== - -#pragma once - -#include "cluster/ClioNode.hpp" - -#include -#include -#include - -namespace cluster { - -/** - * @brief Interface for the cluster communication service. - */ -class ClusterCommunicationServiceInterface { -public: - virtual ~ClusterCommunicationServiceInterface() = default; - - /** - * @brief Get the data of the current node. - * - * @return The data of the current node. - */ - [[nodiscard]] virtual ClioNode - selfData() const = 0; - - /** - * @brief Get the data of all nodes in the cluster (including self). - * - * @return The data of all nodes in the cluster or error if the service is not healthy. - */ - [[nodiscard]] virtual std::expected, std::string> - clusterData() const = 0; -}; - -} // namespace cluster diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp new file mode 100644 index 0000000000..ac387f4a30 --- /dev/null +++ b/src/cluster/impl/RepeatedTask.hpp @@ -0,0 +1,107 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "util/Assert.hpp" +#include "util/Spawn.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cluster::impl { + +// TODO: Try to replace util/Repeat by this +template +class RepeatedTask { + std::chrono::steady_clock::duration interval_; + boost::asio::strand strand_; + + enum class State { Running, Stopped }; + std::atomic state_ = State::Stopped; + + boost::asio::cancellation_signal cancelSignal_; + +public: + RepeatedTask(std::chrono::steady_clock::duration interval, AsioExecutorType& ctx) + : interval_(interval), strand_(boost::asio::make_strand(ctx)) + { + } + + ~RepeatedTask() + { + stop(); + } + + template + requires std::invocable or std::invocable + void + run(Fn&& f) + { + ASSERT(state_ == State::Stopped, "Can only be ran once"); + state_ = State::Running; + util::spawn(strand_, [this, t = std::forward(f)](boost::asio::yield_context yield) { + boost::asio::steady_timer timer(yield.get_executor()); + boost::system::error_code ec; + + while (state_ == State::Running) { + timer.expires_after(interval_); + auto token = cancelSignal_.slot(); + timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); + + if (ec == boost::asio::error::operation_aborted or state_ != State::Running) + break; + + if constexpr (std::invocable) { + t(yield); + } else { + t(); + } + } + }); + } + + void + stop() + { + if (state_ == State::Stopped) + return; + + state_ = State::Stopped; + boost::asio::spawn( + strand_, + [this](auto&&) { cancelSignal_.emit(boost::asio::cancellation_type::all); }, + boost::asio::use_future + ) + .wait(); + } +}; + +} // namespace cluster::impl diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 743f76bdb1..2addca8bc6 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -78,6 +78,7 @@ namespace etl { std::shared_ptr ETLService::makeETLService( util::config::ClioConfigDefinition const& config, + std::shared_ptr state, util::async::AnyExecutionContext ctx, std::shared_ptr backend, std::shared_ptr subscriptions, @@ -87,9 +88,6 @@ ETLService::makeETLService( { std::shared_ptr ret; - auto state = std::make_shared(); - state->isStrictReadonly = config.get("read_only"); - auto fetcher = std::make_shared(backend, balancer); auto extractor = std::make_shared(fetcher); auto publisher = std::make_shared(ctx, backend, subscriptions, *state); diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index 45185d4be4..221a488336 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -137,6 +137,7 @@ class ETLService : public ETLServiceInterface { static std::shared_ptr makeETLService( util::config::ClioConfigDefinition const& config, + std::shared_ptr state, util::async::AnyExecutionContext ctx, std::shared_ptr backend, std::shared_ptr subscriptions, diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 8de89e75a3..786fe534c8 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -19,11 +19,14 @@ #pragma once +#include "util/config/ConfigDefinition.hpp" +#include "util/log/Logger.hpp" #include "util/prometheus/Bool.hpp" #include "util/prometheus/Label.hpp" #include "util/prometheus/Prometheus.hpp" #include +#include namespace etl { @@ -31,6 +34,14 @@ namespace etl { * @brief Represents the state of the ETL subsystem. */ struct SystemState { + static std::shared_ptr + makeSystemState(util::config::ClioConfigDefinition const& config) + { + auto state = std::make_shared(); + state->isStrictReadonly = config.get("read_only"); + return state; + } + /** * @brief Whether the process is in strict read-only mode. * diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp index 0602a2d9ca..da8f520174 100644 --- a/src/etl/WriterState.cpp +++ b/src/etl/WriterState.cpp @@ -30,6 +30,12 @@ WriterState::WriterState(std::shared_ptr state) : systemState_(std: { } +bool +WriterState::isReadOnly() const +{ + return systemState_->isStrictReadonly; +} + bool WriterState::isWriting() const { diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index c75c6a0082..b034274f0f 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -36,6 +36,9 @@ class WriterStateInterface { public: virtual ~WriterStateInterface() = default; + [[nodiscard]] virtual bool + isReadOnly() const = 0; + /** * @brief Check if the ETL process is currently writing to the database. * @return true if the process is writing, false otherwise @@ -80,6 +83,9 @@ class WriterState : public WriterStateInterface { */ WriterState(std::shared_ptr state); + bool + isReadOnly() const override; + /** * @brief Check if the ETL process is currently writing to the database. * @return true if the process is writing, false otherwise diff --git a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp index 544d7e3dbd..46552e8e4f 100644 --- a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp +++ b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp @@ -49,6 +49,7 @@ using namespace cluster; +/* namespace { std::vector const kOTHER_NODES_DATA = { ClioNode{ @@ -226,3 +227,4 @@ TEST_F(ClusterCommunicationServiceTest, Read_Success) EXPECT_TRUE(isHealthyMetric); EXPECT_EQ(nodesInClusterMetric.value(), 3); } +*/ From a68a874964ce9f8923227d702393be68547e171a Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 25 Nov 2025 17:26:15 +0000 Subject: [PATCH 06/41] Fix docs --- src/cluster/Backend.hpp | 52 ++++++++++++++++++++++++++++++++++++++++ src/cluster/ClioNode.hpp | 7 ++++++ src/etl/ETLService.hpp | 3 ++- src/etl/SystemState.hpp | 6 +++++ src/etl/WriterState.hpp | 6 ++++- 5 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index 855606fe21..a866c68420 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -45,24 +45,50 @@ namespace cluster { +/** + * @brief Backend communication handler for cluster state synchronization. + * + * This class manages reading and writing cluster state information to/from the backend database. + * It periodically reads the state of other nodes in the cluster and writes the current node's state, + * enabling cluster-wide coordination and awareness. + */ class Backend { public: + /** @brief Type representing cluster data result - either a vector of nodes or an error message */ using ClusterData = std::expected const>, std::string>; private: + /** @brief Logger for cluster communication activities */ util::Logger log_{"ClusterCommunication"}; + /** @brief Interface to the backend database for reading/writing cluster state */ std::shared_ptr backend_; + + /** @brief State indicating whether this node is writing to the database */ std::unique_ptr writerState_; + /** @brief Repeated task for reading cluster state from the backend */ impl::RepeatedTask readerTask_; + + /** @brief Repeated task for writing this node's state to the backend */ impl::RepeatedTask writerTask_; + /** @brief UUID uniquely identifying this node in the cluster */ ClioNode::UUID selfUuid_; + /** @brief Signal emitted when new cluster state is available */ boost::signals2::signal onNewState_; public: + /** + * @brief Construct a Backend communication handler. + * + * @param ctx The executor context for asynchronous operations + * @param backend Interface to the backend database + * @param writerState State indicating whether this node is writing to the database + * @param readInterval How often to read cluster state from the backend + * @param writeInterval How often to write this node's state to the backend + */ Backend( boost::asio::thread_pool::executor_type ctx, std::shared_ptr backend, @@ -80,12 +106,29 @@ class Backend { Backend& operator=(Backend const&) = delete; + /** + * @brief Start the backend read and write tasks. + * + * Begins periodic reading of cluster state from the backend and writing of this node's state. + */ void run(); + /** + * @brief Stop the backend read and write tasks. + * + * Stops all periodic tasks and waits for them to complete. + */ void stop(); + /** + * @brief Subscribe to new cluster state notifications. + * + * @tparam S Callable type accepting (ClioNode::cUUID, ClusterData) + * @param s Subscriber callback to be invoked when new cluster state is available + * @return A connection object that can be used to unsubscribe + */ template requires std::invocable boost::signals2::connection @@ -95,9 +138,18 @@ class Backend { } private: + /** + * @brief Read cluster state from the backend. + * + * @param yield Coroutine yield context + * @return Cluster data containing all nodes' state, or an error message + */ std::expected const>, std::string> doRead(boost::asio::yield_context yield); + /** + * @brief Write this node's state to the backend. + */ void doWrite(); }; diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index 220b63a3e2..3b70d6f017 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -49,6 +49,13 @@ struct ClioNode { std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. DbRole dbRole; ///< The database role of the node + /** + * @brief Create a ClioNode from writer state. + * + * @param uuid The UUID of the node + * @param writerState The writer state to determine the node's database role + * @return A ClioNode with the current time and role derived from writerState + */ static ClioNode from(UUID uuid, etl::WriterStateInterface const& writerState); }; diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index 221a488336..ab85436cfc 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -127,6 +127,7 @@ class ETLService : public ETLServiceInterface { * Creates and runs the ETL service. * * @param config The configuration to use + * @param state The system state tracking object * @param ctx Execution context for asynchronous operations * @param backend BackendInterface implementation * @param subscriptions Subscription manager @@ -161,7 +162,7 @@ class ETLService : public ETLServiceInterface { * @param initialLoadObserver The observer for initial data loading * @param taskManagerProvider The provider of the task manager instance * @param monitorProvider The provider of the monitor instance - * @param state System state tracking object + * @param state The system state tracking object */ ETLService( util::async::AnyExecutionContext ctx, diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 786fe534c8..9813d59bb4 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -34,6 +34,12 @@ namespace etl { * @brief Represents the state of the ETL subsystem. */ struct SystemState { + /** + * @brief Factory method to create a SystemState instance. + * + * @param config The configuration to use for initializing the system state + * @return A shared pointer to the newly created SystemState + */ static std::shared_ptr makeSystemState(util::config::ClioConfigDefinition const& config) { diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index b034274f0f..df619ba548 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -36,6 +36,10 @@ class WriterStateInterface { public: virtual ~WriterStateInterface() = default; + /** + * @brief Check if the ETL process is in strict read-only mode. + * @return true if the process is in strict read-only mode, false otherwise + */ [[nodiscard]] virtual bool isReadOnly() const = 0; @@ -106,7 +110,7 @@ class WriterState : public WriterStateInterface { * @brief Request to stop writing to the database. * * If not currently writing, this method does nothing. Otherwise, it sets the - * shouldTakeoverWriting flag in the system state to signal the request. + * shouldGiveUpWriter flag in the system state to signal the request. */ void giveUpWriting() override; From ff7cc6aa3f5f083e7cd1e2e2cd7cab440924d9c4 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 25 Nov 2025 18:03:00 +0000 Subject: [PATCH 07/41] Add metrics subservice --- src/cluster/Backend.cpp | 7 +-- src/cluster/Backend.hpp | 8 +-- src/cluster/ClusterCommunicationService.cpp | 5 +- src/cluster/ClusterCommunicationService.hpp | 17 +----- src/cluster/Metrics.hpp | 61 +++++++++++++++++++++ 5 files changed, 73 insertions(+), 25 deletions(-) create mode 100644 src/cluster/Metrics.hpp diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp index bd1211dceb..2f34632c03 100644 --- a/src/cluster/Backend.cpp +++ b/src/cluster/Backend.cpp @@ -43,7 +43,6 @@ #include #include -#include #include #include @@ -69,7 +68,7 @@ Backend::run() { readerTask_.run([this](boost::asio::yield_context yield) { auto clusterState = doRead(yield); - onNewState_(selfUuid_, std::move(clusterState)); + onNewState_(selfUuid_, std::make_shared(std::move(clusterState))); }); writerTask_.run([this]() { doWrite(); }); @@ -87,7 +86,7 @@ Backend::stop() writerTask_.stop(); } -std::expected const>, std::string> +Backend::ClusterData Backend::doRead(boost::asio::yield_context yield) { BackendInterface::ClioNodesDataFetchResult expectedResult; @@ -121,7 +120,7 @@ Backend::doRead(boost::asio::yield_context yield) otherNodesData.push_back(std::move(expectedNodeData).value()); } otherNodesData.push_back(ClioNode::from(selfUuid_, *writerState_)); - return std::make_shared>(otherNodesData); + return std::vector(otherNodesData); } void diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index a866c68420..e72f49a8c5 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -55,7 +55,7 @@ namespace cluster { class Backend { public: /** @brief Type representing cluster data result - either a vector of nodes or an error message */ - using ClusterData = std::expected const>, std::string>; + using ClusterData = std::expected, std::string>; private: /** @brief Logger for cluster communication activities */ @@ -77,7 +77,7 @@ class Backend { ClioNode::UUID selfUuid_; /** @brief Signal emitted when new cluster state is available */ - boost::signals2::signal onNewState_; + boost::signals2::signal)> onNewState_; public: /** @@ -130,7 +130,7 @@ class Backend { * @return A connection object that can be used to unsubscribe */ template - requires std::invocable + requires std::invocable> boost::signals2::connection subscribeToNewState(S&& s) { @@ -144,7 +144,7 @@ class Backend { * @param yield Coroutine yield context * @return Cluster data containing all nodes' state, or an error message */ - std::expected const>, std::string> + ClusterData doRead(boost::asio::yield_context yield); /** diff --git a/src/cluster/ClusterCommunicationService.cpp b/src/cluster/ClusterCommunicationService.cpp index 0a428d2e6d..81f3def6bf 100644 --- a/src/cluster/ClusterCommunicationService.cpp +++ b/src/cluster/ClusterCommunicationService.cpp @@ -51,13 +51,14 @@ ClusterCommunicationService::ClusterCommunicationService( ) : backend_(ctx_.executor(), std::move(backend), std::move(writerState), readInterval, writeInterval) { - nodesInClusterMetric_.set(1); // The node always sees itself - isHealthy_ = true; } void ClusterCommunicationService::run() { + backend_.subscribeToNewState([this](auto&&... args) { + metrics_.onNewState(std::forward(args)...); + }); backend_.run(); } diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index 899e403705..26bb33bdb6 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -20,11 +20,9 @@ #pragma once #include "cluster/Backend.hpp" +#include "cluster/Metrics.hpp" #include "data/BackendInterface.hpp" #include "etl/WriterState.hpp" -#include "util/prometheus/Bool.hpp" -#include "util/prometheus/Gauge.hpp" -#include "util/prometheus/Prometheus.hpp" #include #include @@ -34,7 +32,6 @@ #include #include -#include namespace cluster { @@ -42,20 +39,10 @@ namespace cluster { * @brief Service to post and read messages to/from the cluster. It uses a backend to communicate with the cluster. */ class ClusterCommunicationService { - util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( - "cluster_nodes_total_number", - {}, - "Total number of nodes this node can detect in the cluster." - ); - util::prometheus::Bool isHealthy_ = PrometheusService::boolMetric( - "cluster_communication_is_healthy", - {}, - "Whether cluster communication service is operating healthy (1 - healthy, 0 - we have a problem)" - ); - // TODO: Use util::async::CoroExecutionContext after https://github.com/XRPLF/clio/issues/1973 is implemented boost::asio::thread_pool ctx_{1}; Backend backend_; + Metrics metrics_; public: static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{1000}; diff --git a/src/cluster/Metrics.hpp b/src/cluster/Metrics.hpp new file mode 100644 index 0000000000..bd75fd3018 --- /dev/null +++ b/src/cluster/Metrics.hpp @@ -0,0 +1,61 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "util/prometheus/Bool.hpp" +#include "util/prometheus/Gauge.hpp" +#include "util/prometheus/Prometheus.hpp" + +#include + +namespace cluster { + +class Metrics { + util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( + "cluster_nodes_total_number", + {}, + "Total number of nodes this node can detect in the cluster." + ); + util::prometheus::Bool isHealthy_ = PrometheusService::boolMetric( + "cluster_communication_is_healthy", + {}, + "Whether cluster communication service is operating healthy (1 - healthy, 0 - we have a problem)" + ); + +public: + Metrics() + { + nodesInClusterMetric_.set(1); // The node always sees itself + isHealthy_ = true; + } + + void + onNewState(ClioNode::cUUID, std::shared_ptr clusterData) + { + isHealthy_ = clusterData->has_value(); + if (clusterData->has_value()) { + nodesInClusterMetric_.set(clusterData->value().size()); + } + } +}; + +} // namespace cluster From 0e514d774781f255276f5edf26d3942f1ab37f4b Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 26 Nov 2025 12:32:24 +0000 Subject: [PATCH 08/41] Move implementation to cpp and add docs --- src/cluster/CMakeLists.txt | 2 +- src/cluster/Metrics.cpp | 39 ++++++++++++++++++++++++++++++++++++++ src/cluster/Metrics.hpp | 39 ++++++++++++++++++++++++++------------ 3 files changed, 67 insertions(+), 13 deletions(-) create mode 100644 src/cluster/Metrics.cpp diff --git a/src/cluster/CMakeLists.txt b/src/cluster/CMakeLists.txt index 724c195e7e..2c24c1400c 100644 --- a/src/cluster/CMakeLists.txt +++ b/src/cluster/CMakeLists.txt @@ -1,5 +1,5 @@ add_library(clio_cluster) -target_sources(clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp) +target_sources(clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp Metrics.cpp) target_link_libraries(clio_cluster PRIVATE clio_util clio_data) diff --git a/src/cluster/Metrics.cpp b/src/cluster/Metrics.cpp new file mode 100644 index 0000000000..979b0ea8f1 --- /dev/null +++ b/src/cluster/Metrics.cpp @@ -0,0 +1,39 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Metrics.hpp" + +namespace cluster { + +Metrics::Metrics() +{ + nodesInClusterMetric_.set(1); // The node always sees itself + isHealthy_ = true; +} + +void +Metrics::onNewState(ClioNode::cUUID, std::shared_ptr clusterData) +{ + isHealthy_ = clusterData->has_value(); + if (clusterData->has_value()) { + nodesInClusterMetric_.set(clusterData->value().size()); + } +} + +} // namespace cluster diff --git a/src/cluster/Metrics.hpp b/src/cluster/Metrics.hpp index bd75fd3018..147f72620e 100644 --- a/src/cluster/Metrics.hpp +++ b/src/cluster/Metrics.hpp @@ -29,12 +29,22 @@ namespace cluster { +/** + * @brief Manages Prometheus metrics for cluster communication and node tracking. + * + * This class tracks cluster-related metrics including: + * - Total number of nodes detected in the cluster + * - Health status of cluster communication + */ class Metrics { + /** @brief Gauge tracking the total number of nodes visible in the cluster */ util::prometheus::GaugeInt& nodesInClusterMetric_ = PrometheusService::gaugeInt( "cluster_nodes_total_number", {}, "Total number of nodes this node can detect in the cluster." ); + + /** @brief Boolean metric indicating whether cluster communication is healthy */ util::prometheus::Bool isHealthy_ = PrometheusService::boolMetric( "cluster_communication_is_healthy", {}, @@ -42,20 +52,25 @@ class Metrics { ); public: - Metrics() - { - nodesInClusterMetric_.set(1); // The node always sees itself - isHealthy_ = true; - } + /** + * @brief Constructs a Metrics instance and initializes metrics. + * + * Sets the initial node count to 1 (self) and marks communication as healthy. + */ + Metrics(); + /** + * @brief Updates metrics based on new cluster state. + * + * This callback is invoked when cluster state changes. It updates: + * - Health status based on whether cluster data is available + * - Node count to reflect the current cluster size + * + * @param uuid The UUID of the node (unused in current implementation) + * @param clusterData Shared pointer to the current cluster data; may be empty if communication failed + */ void - onNewState(ClioNode::cUUID, std::shared_ptr clusterData) - { - isHealthy_ = clusterData->has_value(); - if (clusterData->has_value()) { - nodesInClusterMetric_.set(clusterData->value().size()); - } - } + onNewState(ClioNode::cUUID uuid, std::shared_ptr clusterData); }; } // namespace cluster From e84cf0aa28eb690faf03a019e62f7d61edf1aa67 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 26 Nov 2025 14:56:05 +0000 Subject: [PATCH 09/41] Add WriterDecider --- src/cluster/Backend.cpp | 2 +- src/cluster/Backend.hpp | 26 ++----- src/cluster/ClusterCommunicationService.cpp | 6 +- src/cluster/ClusterCommunicationService.hpp | 2 + src/cluster/Metrics.cpp | 5 ++ src/cluster/WriterDecider.hpp | 78 +++++++++++++++++++++ src/cluster/impl/RepeatedTask.hpp | 6 +- src/etl/WriterState.cpp | 7 ++ src/etl/WriterState.hpp | 6 ++ src/util/Spawn.hpp | 1 + 10 files changed, 112 insertions(+), 27 deletions(-) create mode 100644 src/cluster/WriterDecider.hpp diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp index 2f34632c03..1f760dcbd7 100644 --- a/src/cluster/Backend.cpp +++ b/src/cluster/Backend.cpp @@ -49,7 +49,7 @@ namespace cluster { Backend::Backend( - boost::asio::thread_pool::executor_type ctx, + boost::asio::thread_pool& ctx, std::shared_ptr backend, std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval, diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index e72f49a8c5..7e8a8d6baf 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -58,39 +58,30 @@ class Backend { using ClusterData = std::expected, std::string>; private: - /** @brief Logger for cluster communication activities */ util::Logger log_{"ClusterCommunication"}; - /** @brief Interface to the backend database for reading/writing cluster state */ std::shared_ptr backend_; - - /** @brief State indicating whether this node is writing to the database */ std::unique_ptr writerState_; - /** @brief Repeated task for reading cluster state from the backend */ - impl::RepeatedTask readerTask_; - - /** @brief Repeated task for writing this node's state to the backend */ - impl::RepeatedTask writerTask_; + impl::RepeatedTask readerTask_; + impl::RepeatedTask writerTask_; - /** @brief UUID uniquely identifying this node in the cluster */ ClioNode::UUID selfUuid_; - /** @brief Signal emitted when new cluster state is available */ boost::signals2::signal)> onNewState_; public: /** * @brief Construct a Backend communication handler. * - * @param ctx The executor context for asynchronous operations + * @param ctx The execution context for asynchronous operations * @param backend Interface to the backend database * @param writerState State indicating whether this node is writing to the database * @param readInterval How often to read cluster state from the backend * @param writeInterval How often to write this node's state to the backend */ Backend( - boost::asio::thread_pool::executor_type ctx, + boost::asio::thread_pool& ctx, std::shared_ptr backend, std::unique_ptr writerState, std::chrono::steady_clock::duration readInterval, @@ -138,18 +129,9 @@ class Backend { } private: - /** - * @brief Read cluster state from the backend. - * - * @param yield Coroutine yield context - * @return Cluster data containing all nodes' state, or an error message - */ ClusterData doRead(boost::asio::yield_context yield); - /** - * @brief Write this node's state to the backend. - */ void doWrite(); }; diff --git a/src/cluster/ClusterCommunicationService.cpp b/src/cluster/ClusterCommunicationService.cpp index 81f3def6bf..79f89836e4 100644 --- a/src/cluster/ClusterCommunicationService.cpp +++ b/src/cluster/ClusterCommunicationService.cpp @@ -49,7 +49,8 @@ ClusterCommunicationService::ClusterCommunicationService( std::chrono::steady_clock::duration readInterval, std::chrono::steady_clock::duration writeInterval ) - : backend_(ctx_.executor(), std::move(backend), std::move(writerState), readInterval, writeInterval) + : backend_(ctx_, std::move(backend), writerState->clone(), readInterval, writeInterval) + , writerDecider_(ctx_, std::move(writerState)) { } @@ -59,6 +60,9 @@ ClusterCommunicationService::run() backend_.subscribeToNewState([this](auto&&... args) { metrics_.onNewState(std::forward(args)...); }); + backend_.subscribeToNewState([this](auto&&... args) { + writerDecider_.onNewState(std::forward(args)...); + }); backend_.run(); } diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index 26bb33bdb6..aa94c81b56 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -21,6 +21,7 @@ #include "cluster/Backend.hpp" #include "cluster/Metrics.hpp" +#include "cluster/WriterDecider.hpp" #include "data/BackendInterface.hpp" #include "etl/WriterState.hpp" @@ -43,6 +44,7 @@ class ClusterCommunicationService { boost::asio::thread_pool ctx_{1}; Backend backend_; Metrics metrics_; + WriterDecider writerDecider_; public: static constexpr std::chrono::milliseconds kDEFAULT_READ_INTERVAL{1000}; diff --git a/src/cluster/Metrics.cpp b/src/cluster/Metrics.cpp index 979b0ea8f1..ecef9dccad 100644 --- a/src/cluster/Metrics.cpp +++ b/src/cluster/Metrics.cpp @@ -19,6 +19,11 @@ #include "cluster/Metrics.hpp" +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" + +#include + namespace cluster { Metrics::Metrics() diff --git a/src/cluster/WriterDecider.hpp b/src/cluster/WriterDecider.hpp new file mode 100644 index 0000000000..299742b656 --- /dev/null +++ b/src/cluster/WriterDecider.hpp @@ -0,0 +1,78 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" +#include "util/Spawn.hpp" +#include "util/log/Logger.hpp" + +#include + +#include +#include +#include + +namespace cluster { + +class WriterDecider { + boost::asio::thread_pool& ctx_; + std::unique_ptr writerState_; + +public: + WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState) + : ctx_(ctx), writerState_(std::move(writerState)) + { + } + + void + onNewState(ClioNode::cUUID selfId, std::shared_ptr clusterData) + { + util::spawn( + ctx_, + [writerState = writerState_->clone(), + selfId = std::move(selfId), + clusterData = std::move(clusterData)](auto&&) { + if (not clusterData->has_value()) + return; + auto data = clusterData->value(); + std::ranges::sort(data, [](ClioNode const& lhs, ClioNode const& rhs) { return lhs.uuid < rhs.uuid; }); + + auto const it = std::ranges::find_if(data, [](ClioNode const& node) { + return node.dbRole != ClioNode::DbRole::ReadOnly; + }); + + if (it == data.end()) { + LOG(util::LogService::warn()) << "No nodes allowed to write in the cluster"; + return; + } + + if (it->uuid == selfId) { + writerState->startWriting(); + } else { + writerState->giveUpWriting(); + } + } + ); + } +}; + +} // namespace cluster diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp index ac387f4a30..1583eea276 100644 --- a/src/cluster/impl/RepeatedTask.hpp +++ b/src/cluster/impl/RepeatedTask.hpp @@ -39,10 +39,10 @@ namespace cluster::impl { // TODO: Try to replace util/Repeat by this -template +template class RepeatedTask { std::chrono::steady_clock::duration interval_; - boost::asio::strand strand_; + boost::asio::strand strand_; enum class State { Running, Stopped }; std::atomic state_ = State::Stopped; @@ -50,7 +50,7 @@ class RepeatedTask { boost::asio::cancellation_signal cancelSignal_; public: - RepeatedTask(std::chrono::steady_clock::duration interval, AsioExecutorType& ctx) + RepeatedTask(std::chrono::steady_clock::duration interval, Context& ctx) : interval_(interval), strand_(boost::asio::make_strand(ctx)) { } diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp index da8f520174..d455ebc625 100644 --- a/src/etl/WriterState.cpp +++ b/src/etl/WriterState.cpp @@ -60,4 +60,11 @@ WriterState::giveUpWriting() systemState_->shouldTakeoverWriting = true; } +std::unique_ptr +WriterState::clone() const +{ + auto c = WriterState(*this); + return std::make_unique(std::move(c)); +} + } // namespace etl diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index df619ba548..458bac1043 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -67,6 +67,9 @@ class WriterStateInterface { */ virtual void giveUpWriting() = 0; + + [[nodiscard]] virtual std::unique_ptr + clone() const = 0; }; /** @@ -114,6 +117,9 @@ class WriterState : public WriterStateInterface { */ void giveUpWriting() override; + + std::unique_ptr + clone() const override; }; } // namespace etl diff --git a/src/util/Spawn.hpp b/src/util/Spawn.hpp index bdb90bc578..79229cf9e7 100644 --- a/src/util/Spawn.hpp +++ b/src/util/Spawn.hpp @@ -22,6 +22,7 @@ #include #include +#include #include #include From 6181abc4eb1e8c6d145472e84a39f34c685a5435 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 26 Nov 2025 15:03:20 +0000 Subject: [PATCH 10/41] Create cpp file and add docs --- src/cluster/CMakeLists.txt | 4 +- src/cluster/WriterDecider.cpp | 66 ++++++++++++++++++++++++++++++++ src/cluster/WriterDecider.hpp | 71 +++++++++++++++++------------------ 3 files changed, 103 insertions(+), 38 deletions(-) create mode 100644 src/cluster/WriterDecider.cpp diff --git a/src/cluster/CMakeLists.txt b/src/cluster/CMakeLists.txt index 2c24c1400c..f6460bb23b 100644 --- a/src/cluster/CMakeLists.txt +++ b/src/cluster/CMakeLists.txt @@ -1,5 +1,7 @@ add_library(clio_cluster) -target_sources(clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp Metrics.cpp) +target_sources( + clio_cluster PRIVATE Backend.cpp ClioNode.cpp ClusterCommunicationService.cpp Metrics.cpp WriterDecider.cpp +) target_link_libraries(clio_cluster PRIVATE clio_util clio_data) diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp new file mode 100644 index 0000000000..59fc92d99e --- /dev/null +++ b/src/cluster/WriterDecider.cpp @@ -0,0 +1,66 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/WriterDecider.hpp" + +#include "util/Spawn.hpp" +#include "util/log/Logger.hpp" + +#include +#include + +namespace cluster { + +WriterDecider::WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState) + : ctx_(ctx), writerState_(std::move(writerState)) +{ +} + +void +WriterDecider::onNewState(ClioNode::cUUID selfId, std::shared_ptr clusterData) +{ + util::spawn( + ctx_, + [writerState = writerState_->clone(), + selfId = std::move(selfId), + clusterData = std::move(clusterData)](auto&&) { + if (not clusterData->has_value()) + return; + auto data = clusterData->value(); + std::ranges::sort(data, [](ClioNode const& lhs, ClioNode const& rhs) { return lhs.uuid < rhs.uuid; }); + + auto const it = std::ranges::find_if(data, [](ClioNode const& node) { + return node.dbRole != ClioNode::DbRole::ReadOnly; + }); + + if (it == data.end()) { + LOG(util::LogService::warn()) << "No nodes allowed to write in the cluster"; + return; + } + + if (it->uuid == selfId) { + writerState->startWriting(); + } else { + writerState->giveUpWriting(); + } + } + ); +} + +} // namespace cluster diff --git a/src/cluster/WriterDecider.hpp b/src/cluster/WriterDecider.hpp index 299742b656..8ee38990d0 100644 --- a/src/cluster/WriterDecider.hpp +++ b/src/cluster/WriterDecider.hpp @@ -22,57 +22,54 @@ #include "cluster/Backend.hpp" #include "cluster/ClioNode.hpp" #include "etl/WriterState.hpp" -#include "util/Spawn.hpp" -#include "util/log/Logger.hpp" #include -#include #include -#include namespace cluster { +/** + * @brief Decides which node in the cluster should be the writer based on cluster state. + * + * This class monitors cluster state changes and determines whether the current node + * should act as the writer to the database. The decision is made by: + * 1. Sorting all nodes by UUID for deterministic ordering + * 2. Selecting the first node that is allowed to write (not ReadOnly) + * 3. Activating writing on this node if it's the current node, otherwise deactivating + * + * This ensures only one node in the cluster actively writes to the database at a time. + */ class WriterDecider { + /** @brief Thread pool for spawning asynchronous tasks */ boost::asio::thread_pool& ctx_; + + /** @brief Interface for controlling the writer state of this node */ std::unique_ptr writerState_; public: - WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState) - : ctx_(ctx), writerState_(std::move(writerState)) - { - } + /** + * @brief Constructs a WriterDecider. + * + * @param ctx Thread pool for executing asynchronous operations + * @param writerState Writer state interface for controlling write operations + */ + WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr writerState); + /** + * @brief Handles cluster state changes and decides whether this node should be the writer. + * + * This method is called when cluster state changes. It asynchronously: + * - Sorts all nodes by UUID to establish a deterministic order + * - Identifies the first node allowed to write (not ReadOnly) + * - Activates writing if this node is selected, otherwise deactivates writing + * - Logs a warning if no nodes in the cluster are allowed to write + * + * @param selfId The UUID of the current node + * @param clusterData Shared pointer to current cluster data; may be empty if communication failed + */ void - onNewState(ClioNode::cUUID selfId, std::shared_ptr clusterData) - { - util::spawn( - ctx_, - [writerState = writerState_->clone(), - selfId = std::move(selfId), - clusterData = std::move(clusterData)](auto&&) { - if (not clusterData->has_value()) - return; - auto data = clusterData->value(); - std::ranges::sort(data, [](ClioNode const& lhs, ClioNode const& rhs) { return lhs.uuid < rhs.uuid; }); - - auto const it = std::ranges::find_if(data, [](ClioNode const& node) { - return node.dbRole != ClioNode::DbRole::ReadOnly; - }); - - if (it == data.end()) { - LOG(util::LogService::warn()) << "No nodes allowed to write in the cluster"; - return; - } - - if (it->uuid == selfId) { - writerState->startWriting(); - } else { - writerState->giveUpWriting(); - } - } - ); - } + onNewState(ClioNode::cUUID selfId, std::shared_ptr clusterData); }; } // namespace cluster From f855d8dc9d123d455faa0abd44abcbfffdfd7f99 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 26 Nov 2025 15:15:53 +0000 Subject: [PATCH 11/41] Add tests for ClioNode --- tests/common/util/MockWriterState.hpp | 36 ++++++++++++++++ tests/unit/cluster/ClioNodeTests.cpp | 60 +++++++++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 tests/common/util/MockWriterState.hpp diff --git a/tests/common/util/MockWriterState.hpp b/tests/common/util/MockWriterState.hpp new file mode 100644 index 0000000000..25b818be98 --- /dev/null +++ b/tests/common/util/MockWriterState.hpp @@ -0,0 +1,36 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include "etl/WriterState.hpp" + +#include + +#include + +struct MockWriterStateBase : public etl::WriterStateInterface { + MOCK_METHOD(bool, isReadOnly, (), (const, override)); + MOCK_METHOD(bool, isWriting, (), (const, override)); + MOCK_METHOD(void, startWriting, (), (override)); + MOCK_METHOD(void, giveUpWriting, (), (override)); + MOCK_METHOD(std::unique_ptr, clone, (), (const, override)); +}; + +using MockWriterState = testing::StrictMock; diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index f15f4fa61b..8dbd219f28 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -18,6 +18,7 @@ //============================================================================== #include "cluster/ClioNode.hpp" +#include "util/MockWriterState.hpp" #include "util/NameGenerator.hpp" #include "util/TimeUtils.hpp" @@ -27,6 +28,7 @@ #include #include #include +#include #include #include @@ -147,3 +149,61 @@ TEST_F(ClioNodeDbRoleTest, DeserializationMissingDbRole) boost::json::value const jsonValue = {{"update_time", updateTimeStr}}; EXPECT_THROW(boost::json::value_to(jsonValue), std::runtime_error); } + +struct ClioNodeFromTestBundle { + std::string testName; + bool readOnly; + bool writing; + ClioNode::DbRole expectedRole; +}; + +struct ClioNodeFromTest : ClioNodeTest, testing::WithParamInterface { + std::shared_ptr uuid = std::make_shared(boost::uuids::random_generator()()); + + MockWriterState writerState; +}; + +INSTANTIATE_TEST_SUITE_P( + AllWriterStates, + ClioNodeFromTest, + testing::Values( + ClioNodeFromTestBundle{ + .testName = "ReadOnly", + .readOnly = true, + .writing = false, + .expectedRole = ClioNode::DbRole::ReadOnly + }, + ClioNodeFromTestBundle{ + .testName = "NotWriterNotReadOnly", + .readOnly = false, + .writing = false, + .expectedRole = ClioNode::DbRole::NotWriter + }, + ClioNodeFromTestBundle{ + .testName = "Writer", + .readOnly = false, + .writing = true, + .expectedRole = ClioNode::DbRole::Writer + } + ), + tests::util::kNAME_GENERATOR +); + +TEST_P(ClioNodeFromTest, FromWriterState) +{ + auto const& param = GetParam(); + + EXPECT_CALL(writerState, isReadOnly()).WillOnce(testing::Return(param.readOnly)); + if (not param.readOnly) { + EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + } + + auto const beforeTime = std::chrono::system_clock::now(); + auto const node = ClioNode::from(uuid, writerState); + auto const afterTime = std::chrono::system_clock::now(); + + EXPECT_EQ(node.uuid, uuid); + EXPECT_EQ(node.dbRole, param.expectedRole); + EXPECT_GE(node.updateTime, beforeTime); + EXPECT_LE(node.updateTime, afterTime); +} From e8409a346e64bc46e0f462fb2205d308213fe613 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 26 Nov 2025 17:46:57 +0000 Subject: [PATCH 12/41] WIP --- src/cluster/impl/RepeatedTask.hpp | 5 +- src/util/Assert.cpp | 2 +- src/util/log/Logger.cpp | 6 +++ src/util/log/Logger.hpp | 8 +++ tests/unit/CMakeLists.txt | 1 + tests/unit/cluster/BackendTests.cpp | 79 +++++++++++++++++++++++++++++ 6 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 tests/unit/cluster/BackendTests.cpp diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp index 1583eea276..f692e52bab 100644 --- a/src/cluster/impl/RepeatedTask.hpp +++ b/src/cluster/impl/RepeatedTask.hpp @@ -70,11 +70,12 @@ class RepeatedTask { util::spawn(strand_, [this, t = std::forward(f)](boost::asio::yield_context yield) { boost::asio::steady_timer timer(yield.get_executor()); boost::system::error_code ec; + auto token = cancelSignal_.slot(); + auto slot = boost::asio::bind_cancellation_slot(token, yield[ec]); while (state_ == State::Running) { timer.expires_after(interval_); - auto token = cancelSignal_.slot(); - timer.async_wait(boost::asio::bind_cancellation_slot(token, yield[ec])); + timer.async_wait(slot); if (ec == boost::asio::error::operation_aborted or state_ != State::Running) break; diff --git a/src/util/Assert.cpp b/src/util/Assert.cpp index 87165d607b..0e923504fc 100644 --- a/src/util/Assert.cpp +++ b/src/util/Assert.cpp @@ -54,7 +54,7 @@ OnAssert::resetAction() void OnAssert::defaultAction(std::string_view message) { - if (LogServiceState::initialized()) { + if (LogServiceState::initialized() and LogServiceState::hasSinks()) { LOG(LogService::fatal()) << message; } else { std::cerr << message; diff --git a/src/util/log/Logger.cpp b/src/util/log/Logger.cpp index 5a3c2f771c..28b6cab143 100644 --- a/src/util/log/Logger.cpp +++ b/src/util/log/Logger.cpp @@ -271,6 +271,12 @@ LogServiceState::initialized() return initialized_; } +bool +LogServiceState::hasSinks() +{ + return not sinks_.empty(); +} + void LogServiceState::reset() { diff --git a/src/util/log/Logger.hpp b/src/util/log/Logger.hpp index 370625890a..9e7b61eb49 100644 --- a/src/util/log/Logger.hpp +++ b/src/util/log/Logger.hpp @@ -267,6 +267,14 @@ class LogServiceState { [[nodiscard]] static bool initialized(); + /** + * @brief Whether the LogService has any sink. If there is no sink, logger will not log messages anywhere. + * + * @return true if the LogService has at least one sink + */ + [[nodiscard]] static bool + hasSinks(); + /** * @brief Reset the logging service to uninitialized state. */ diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 4d1dfd08e7..110e4efb86 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -21,6 +21,7 @@ target_sources( data/impl/LedgerCacheFileTests.cpp data/impl/OutputFileTests.cpp # Cluster + cluster/BackendTests.cpp cluster/ClioNodeTests.cpp cluster/ClusterCommunicationServiceTests.cpp # ETL diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp new file mode 100644 index 0000000000..c67fef3482 --- /dev/null +++ b/tests/unit/cluster/BackendTests.cpp @@ -0,0 +1,79 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "data/BackendInterface.hpp" +#include "util/MockBackendTestFixture.hpp" +#include "util/MockPrometheus.hpp" +#include "util/MockWriterState.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace cluster; + +struct ClusterBackendTest : util::prometheus::WithPrometheus, MockBackendTestStrict { + ~ClusterBackendTest() override + { + ctx.stop(); + ctx.join(); + } + + boost::asio::thread_pool ctx; + std::unique_ptr writerState = std::make_unique(); + MockWriterState& writerStateRef = *writerState; + testing::StrictMock)>> + callbackMock; +}; + +TEST_F(ClusterBackendTest, Stop) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(callbackMock, Call).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + + clusterBackend.run(); + std::this_thread::sleep_for(std::chrono::milliseconds{20}); + clusterBackend.stop(); + + testing::Mock::VerifyAndClearExpectations(&(*backend_)); + // Wait to make sure there is no new calls of mockDbBackend + std::this_thread::sleep_for(std::chrono::milliseconds{20}); +} From 8fa1cb9a33609b4bcc99c4fc9318980dab02a08d Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 27 Nov 2025 14:39:30 +0000 Subject: [PATCH 13/41] Add tests for RepeatedTask --- src/cluster/impl/RepeatedTask.hpp | 40 ++-- tests/unit/CMakeLists.txt | 1 + tests/unit/cluster/RepeatedTaskTests.cpp | 226 +++++++++++++++++++++++ 3 files changed, 245 insertions(+), 22 deletions(-) create mode 100644 tests/unit/cluster/RepeatedTaskTests.cpp diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp index f692e52bab..37f70cd953 100644 --- a/src/cluster/impl/RepeatedTask.hpp +++ b/src/cluster/impl/RepeatedTask.hpp @@ -35,6 +35,7 @@ #include #include #include +#include namespace cluster::impl { @@ -47,11 +48,12 @@ class RepeatedTask { enum class State { Running, Stopped }; std::atomic state_ = State::Stopped; - boost::asio::cancellation_signal cancelSignal_; + std::binary_semaphore semaphore_{0}; + boost::asio::steady_timer timer_; public: RepeatedTask(std::chrono::steady_clock::duration interval, Context& ctx) - : interval_(interval), strand_(boost::asio::make_strand(ctx)) + : interval_(interval), strand_(boost::asio::make_strand(ctx)), timer_(strand_) { } @@ -67,41 +69,35 @@ class RepeatedTask { { ASSERT(state_ == State::Stopped, "Can only be ran once"); state_ = State::Running; - util::spawn(strand_, [this, t = std::forward(f)](boost::asio::yield_context yield) { - boost::asio::steady_timer timer(yield.get_executor()); + util::spawn(strand_, [this, f = std::forward(f)](boost::asio::yield_context yield) { boost::system::error_code ec; - auto token = cancelSignal_.slot(); - auto slot = boost::asio::bind_cancellation_slot(token, yield[ec]); while (state_ == State::Running) { - timer.expires_after(interval_); - timer.async_wait(slot); + timer_.expires_after(interval_); + timer_.async_wait(yield[ec]); - if (ec == boost::asio::error::operation_aborted or state_ != State::Running) + if (ec or state_ != State::Running) break; - if constexpr (std::invocable) { - t(yield); + if constexpr (std::invocable) { + f(yield); } else { - t(); + f(); } } + + semaphore_.release(); }); } void stop() { - if (state_ == State::Stopped) - return; - - state_ = State::Stopped; - boost::asio::spawn( - strand_, - [this](auto&&) { cancelSignal_.emit(boost::asio::cancellation_type::all); }, - boost::asio::use_future - ) - .wait(); + if (auto expected = State::Running; not state_.compare_exchange_strong(expected, State::Stopped)) + return; // Already stopped or not started + + boost::asio::spawn(strand_, [this](auto&&) { timer_.cancel(); }, boost::asio::use_future).wait(); + semaphore_.acquire(); } }; diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index 110e4efb86..f91d7af15b 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -24,6 +24,7 @@ target_sources( cluster/BackendTests.cpp cluster/ClioNodeTests.cpp cluster/ClusterCommunicationServiceTests.cpp + cluster/RepeatedTaskTests.cpp # ETL etl/AmendmentBlockHandlerTests.cpp etl/CacheLoaderSettingsTests.cpp diff --git a/tests/unit/cluster/RepeatedTaskTests.cpp b/tests/unit/cluster/RepeatedTaskTests.cpp new file mode 100644 index 0000000000..90c0fc6b9a --- /dev/null +++ b/tests/unit/cluster/RepeatedTaskTests.cpp @@ -0,0 +1,226 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/impl/RepeatedTask.hpp" +#include "util/AsioContextTestFixture.hpp" + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +using namespace cluster::impl; +using namespace testing; + +struct RepeatedTaskTest : AsyncAsioContextTest { + static constexpr auto kTIMEOUT = std::chrono::seconds{5}; +}; + +template +struct RepeatedTaskTypedTest : RepeatedTaskTest { + std::atomic_int32_t callCount{0}; + std::binary_semaphore semaphore{0}; + testing::StrictMock mockFn; + + void + expectCalls(int const expectedCalls) + { + callCount = 0; + + EXPECT_CALL(mockFn, Call).Times(AtLeast(expectedCalls)).WillRepeatedly([this, expectedCalls](auto&&...) { + ++callCount; + if (callCount >= expectedCalls) { + semaphore.release(); + } + }); + } +}; + +namespace { + +using TypesToTest = Types, MockFunction>; + +} // namespace + +TYPED_TEST_SUITE(RepeatedTaskTypedTest, TypesToTest); + +TYPED_TEST(RepeatedTaskTypedTest, CallsFunctionRepeatedly) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + this->expectCalls(3); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + task.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, StopsImmediately) +{ + auto const interval = std::chrono::seconds(5); + RepeatedTask task(interval, this->ctx_); + + task.run(this->mockFn.AsStdFunction()); + + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + + auto start = std::chrono::steady_clock::now(); + task.stop(); + EXPECT_LT(std::chrono::steady_clock::now() - start, interval); +} + +TYPED_TEST(RepeatedTaskTypedTest, MultipleStops) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + this->expectCalls(3); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + task.stop(); + task.stop(); + task.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, DestructorStopsTask) +{ + this->expectCalls(3); + + { + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + // Destructor will call stop() + } + + auto const countAfterDestruction = this->callCount.load(); + + // Wait a bit - no more calls should happen + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + EXPECT_EQ(this->callCount, countAfterDestruction); +} + +TYPED_TEST(RepeatedTaskTypedTest, StopWithoutRunIsNoOp) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + // Should not crash or hang + task.stop(); +} + +TEST_F(RepeatedTaskTest, MultipleTasksRunConcurrently) +{ + StrictMock> mockFn1; + StrictMock> mockFn2; + + RepeatedTask task1(std::chrono::milliseconds(1), ctx_); + RepeatedTask task2(std::chrono::milliseconds(2), ctx_); + + std::atomic_int32_t callCount1{0}; + std::atomic_int32_t callCount2{0}; + std::binary_semaphore semaphore1{0}; + std::binary_semaphore semaphore2{0}; + + EXPECT_CALL(mockFn1, Call).Times(AtLeast(10)).WillRepeatedly([&]() { + if (++callCount1 >= 10) { + semaphore1.release(); + } + }); + + EXPECT_CALL(mockFn2, Call).Times(AtLeast(5)).WillRepeatedly([&]() { + if (++callCount2 >= 5) { + semaphore2.release(); + } + }); + + task1.run(mockFn1.AsStdFunction()); + task2.run(mockFn2.AsStdFunction()); + + EXPECT_TRUE(semaphore1.try_acquire_for(kTIMEOUT)); + EXPECT_TRUE(semaphore2.try_acquire_for(kTIMEOUT)); + + task1.stop(); + task2.stop(); +} + +TYPED_TEST(RepeatedTaskTypedTest, TaskStateTransitionsCorrectly) +{ + RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); + + // Initially not running + task.stop(); // Should be no-op + + this->expectCalls(3); + + // Start running + task.run(this->mockFn.AsStdFunction()); + + EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); + + // Stop + task.stop(); + + // Stop again should be no-op + task.stop(); +} + +TEST_F(RepeatedTaskTest, FunctionCanAccessYieldContext) +{ + StrictMock> mockFn; + std::atomic_bool yieldContextUsed = false; + std::binary_semaphore semaphore{0}; + + RepeatedTask task(std::chrono::milliseconds(1), ctx_); + + EXPECT_CALL(mockFn, Call).Times(AtLeast(1)).WillRepeatedly([&](boost::asio::yield_context yield) { + if (yieldContextUsed) + return; + + // Use the yield context to verify it's valid + boost::asio::steady_timer timer(yield.get_executor()); + timer.expires_after(std::chrono::milliseconds(1)); + boost::system::error_code ec; + timer.async_wait(yield[ec]); + EXPECT_FALSE(ec) << ec.message(); + yieldContextUsed = true; + semaphore.release(); + }); + + task.run(mockFn.AsStdFunction()); + + EXPECT_TRUE(semaphore.try_acquire_for(kTIMEOUT)); + + task.stop(); + + EXPECT_TRUE(yieldContextUsed); +} From 60e3ce07c6be53d405b111d335d4677d8e303ddf Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 2 Dec 2025 11:44:27 +0000 Subject: [PATCH 14/41] Add Backend tests --- src/cluster/Backend.cpp | 10 +- src/cluster/Backend.hpp | 3 + tests/common/util/MockBackendTestFixture.hpp | 1 + tests/unit/cluster/BackendTests.cpp | 197 ++++++++++++++++++- 4 files changed, 208 insertions(+), 3 deletions(-) diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp index 1f760dcbd7..dad5f42252 100644 --- a/src/cluster/Backend.cpp +++ b/src/cluster/Backend.cpp @@ -67,8 +67,8 @@ void Backend::run() { readerTask_.run([this](boost::asio::yield_context yield) { - auto clusterState = doRead(yield); - onNewState_(selfUuid_, std::make_shared(std::move(clusterState))); + auto clusterData = doRead(yield); + onNewState_(selfUuid_, std::make_shared(std::move(clusterData))); }); writerTask_.run([this]() { doWrite(); }); @@ -86,6 +86,12 @@ Backend::stop() writerTask_.stop(); } +ClioNode::cUUID +Backend::selfId() const +{ + return selfUuid_; +} + Backend::ClusterData Backend::doRead(boost::asio::yield_context yield) { diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index 7e8a8d6baf..31c90b8504 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -128,6 +128,9 @@ class Backend { return onNewState_.connect(s); } + ClioNode::cUUID + selfId() const; + private: ClusterData doRead(boost::asio::yield_context yield); diff --git a/tests/common/util/MockBackendTestFixture.hpp b/tests/common/util/MockBackendTestFixture.hpp index 06c84e3722..59c6d89df2 100644 --- a/tests/common/util/MockBackendTestFixture.hpp +++ b/tests/common/util/MockBackendTestFixture.hpp @@ -25,6 +25,7 @@ #include "util/config/ConfigDefinition.hpp" #include +#include #include diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index c67fef3482..8cc38cb45c 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -37,8 +38,12 @@ #include #include +#include +#include +#include #include #include +#include using namespace cluster; @@ -54,8 +59,51 @@ struct ClusterBackendTest : util::prometheus::WithPrometheus, MockBackendTestStr MockWriterState& writerStateRef = *writerState; testing::StrictMock)>> callbackMock; + std::binary_semaphore semaphore{0}; + + class SemaphoreReleaseGuard { + std::binary_semaphore& semaphore_; + + public: + SemaphoreReleaseGuard(std::binary_semaphore& s) : semaphore_(s) + { + } + ~SemaphoreReleaseGuard() + { + semaphore_.release(); + } + }; }; +TEST_F(ClusterBackendTest, SubscribeToNewState) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()); + EXPECT_EQ(clusterData->value().size(), 1); + auto const& nodeData = clusterData->value().front(); + EXPECT_EQ(nodeData.uuid, selfId); + EXPECT_EQ(nodeData.dbRole, ClioNode::DbRole::ReadOnly); + EXPECT_LE(nodeData.updateTime, std::chrono::system_clock::now()); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + TEST_F(ClusterBackendTest, Stop) { Backend clusterBackend{ @@ -66,7 +114,6 @@ TEST_F(ClusterBackendTest, Stop) .Times(testing::AtLeast(1)) .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); - EXPECT_CALL(callbackMock, Call).Times(testing::AtLeast(1)); EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); clusterBackend.run(); @@ -77,3 +124,151 @@ TEST_F(ClusterBackendTest, Stop) // Wait to make sure there is no new calls of mockDbBackend std::this_thread::sleep_for(std::chrono::milliseconds{20}); } + +TEST_F(ClusterBackendTest, FetchClioNodesDataThrowsException) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Throw(std::runtime_error("Database connection failed"))); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::cUUID, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_EQ(clusterData->error(), "Failed to fetch nodes data"); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + auto const otherNodeJson = R"({ + "db_role": 2, + "update_time": "2025-01-15T10:30:00Z" + })"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, otherNodeJson}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([&](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()) << clusterData->error(); + EXPECT_EQ(clusterData->value().size(), 2); + EXPECT_EQ(selfId, clusterBackend.selfId()); + + bool foundSelf = false; + bool foundOther = false; + + for (auto const& node : clusterData->value()) { + if (*node.uuid == *selfId) { + foundSelf = true; + EXPECT_EQ(node.dbRole, ClioNode::DbRole::NotWriter); + } else if (*node.uuid == otherUuid) { + foundOther = true; + EXPECT_EQ(node.dbRole, ClioNode::DbRole::Writer); + } + EXPECT_LE(node.updateTime, std::chrono::system_clock::now()); + } + + EXPECT_TRUE(foundSelf); + EXPECT_TRUE(foundOther); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsInvalidJson) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + auto const invalidJson = "{ invalid json"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, invalidJson}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this, invalidJson](ClioNode::cUUID, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error parsing json from DB")); + EXPECT_THAT(clusterData->error(), testing::HasSubstr(invalidJson)); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + +TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndDbRole) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + auto const beforeRun = std::chrono::floor(std::chrono::system_clock::now()); + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(*backend_, writeNodeMessage) + .Times(testing::AtLeast(1)) + .WillRepeatedly([&](boost::uuids::uuid const& uuid, std::string message) { + SemaphoreReleaseGuard guard{semaphore}; + auto const afterWrite = std::chrono::system_clock::now(); + + EXPECT_EQ(uuid, *clusterBackend.selfId()); + auto const json = boost::json::parse(message); + auto const node = boost::json::try_value_to(json); + ASSERT_TRUE(node.has_value()); + EXPECT_EQ(node->dbRole, ClioNode::DbRole::NotWriter); + EXPECT_GE(node->updateTime, beforeRun); + EXPECT_LE(node->updateTime, afterWrite); + }); + + clusterBackend.run(); + semaphore.acquire(); +} From 55ba0ab0ce18a902223717a4140a073b8ec71b1e Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 2 Dec 2025 17:58:14 +0000 Subject: [PATCH 15/41] Add tests for Metrics --- tests/unit/CMakeLists.txt | 1 + tests/unit/cluster/MetricsTests.cpp | 192 ++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) create mode 100644 tests/unit/cluster/MetricsTests.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index f91d7af15b..bf1f21108d 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -24,6 +24,7 @@ target_sources( cluster/BackendTests.cpp cluster/ClioNodeTests.cpp cluster/ClusterCommunicationServiceTests.cpp + cluster/MetricsTests.cpp cluster/RepeatedTaskTests.cpp # ETL etl/AmendmentBlockHandlerTests.cpp diff --git a/tests/unit/cluster/MetricsTests.cpp b/tests/unit/cluster/MetricsTests.cpp new file mode 100644 index 0000000000..477895d18d --- /dev/null +++ b/tests/unit/cluster/MetricsTests.cpp @@ -0,0 +1,192 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "cluster/Metrics.hpp" +#include "util/MockPrometheus.hpp" +#include "util/prometheus/Gauge.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +using namespace cluster; +using namespace util::prometheus; +using namespace testing; + +struct MetricsTest : WithMockPrometheus { + std::shared_ptr uuid1 = + std::make_shared(boost::uuids::random_generator()()); + std::shared_ptr uuid2 = + std::make_shared(boost::uuids::random_generator()()); + std::shared_ptr uuid3 = + std::make_shared(boost::uuids::random_generator()()); +}; + +TEST_F(MetricsTest, InitializesMetricsOnConstruction) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; +} + +TEST_F(MetricsTest, OnNewStateWithValidClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + // Initial construction expectations + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + // Create cluster data with 3 nodes + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; + ClioNode node3{ + .uuid = uuid3, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::NotWriter + }; + + std::vector nodes = {node1, node2, node3}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + // Expect metrics to be updated: health = true (1), node count = 3 + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(3)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithEmptyClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + // Initial construction expectations + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + // Create empty cluster data (0 nodes) + std::vector nodes = {}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + // Expect metrics to be updated: health = true (1), node count = 0 + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(0)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithFailedClusterData) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + // Initial construction expectations + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + // Create failed cluster data (unexpected error) + Backend::ClusterData clusterData = + std::expected, std::string>(std::unexpected("Connection failed")); + auto sharedClusterData = std::make_shared(clusterData); + + // Expect health to be set to false (0), node count should not be updated + EXPECT_CALL(isHealthyMock, set(0)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateWithSingleNode) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + // Initial construction expectations + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + // Create cluster data with just 1 node (self) + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + + std::vector nodes = {node1}; + Backend::ClusterData clusterData = std::expected, std::string>(nodes); + auto sharedClusterData = std::make_shared(clusterData); + + // Expect metrics to be updated: health = true (1), node count = 1 + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(1)); + + metrics.onNewState(uuid1, sharedClusterData); +} + +TEST_F(MetricsTest, OnNewStateRecoveryFromFailure) +{ + auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); + auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); + + // Initial construction expectations + EXPECT_CALL(nodesInClusterMock, set(1)); + EXPECT_CALL(isHealthyMock, set(1)); + + Metrics metrics; + + // First update: failure + Backend::ClusterData clusterData1 = + std::expected, std::string>(std::unexpected("Connection timeout")); + auto sharedClusterData1 = std::make_shared(clusterData1); + + EXPECT_CALL(isHealthyMock, set(0)); + + metrics.onNewState(uuid1, sharedClusterData1); + + // Second update: recovery with 2 nodes + ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; + ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; + + std::vector nodes = {node1, node2}; + Backend::ClusterData clusterData2 = std::expected, std::string>(nodes); + auto sharedClusterData2 = std::make_shared(clusterData2); + + EXPECT_CALL(isHealthyMock, set(1)); + EXPECT_CALL(nodesInClusterMock, set(2)); + + metrics.onNewState(uuid2, sharedClusterData2); +} From 4697b1d60b429f20da3f428bc51c93c8498c9a28 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 3 Dec 2025 15:40:25 +0000 Subject: [PATCH 16/41] Add tests for WriterDecider --- src/cluster/WriterDecider.cpp | 25 ++- tests/unit/CMakeLists.txt | 1 + tests/unit/cluster/WriterDeciderTests.cpp | 226 ++++++++++++++++++++++ 3 files changed, 244 insertions(+), 8 deletions(-) create mode 100644 tests/unit/cluster/WriterDeciderTests.cpp diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index 59fc92d99e..d7b6048eee 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -19,11 +19,18 @@ #include "cluster/WriterDecider.hpp" +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "etl/WriterState.hpp" #include "util/Spawn.hpp" #include "util/log/Logger.hpp" +#include + #include +#include #include +#include namespace cluster { @@ -35,26 +42,28 @@ WriterDecider::WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr clusterData) { + if (not clusterData->has_value()) + return; + util::spawn( ctx_, [writerState = writerState_->clone(), selfId = std::move(selfId), - clusterData = std::move(clusterData)](auto&&) { - if (not clusterData->has_value()) - return; - auto data = clusterData->value(); - std::ranges::sort(data, [](ClioNode const& lhs, ClioNode const& rhs) { return lhs.uuid < rhs.uuid; }); + clusterData = clusterData->value()](auto&&) mutable { + std::ranges::sort(clusterData, [](ClioNode const& lhs, ClioNode const& rhs) { + return *lhs.uuid < *rhs.uuid; + }); - auto const it = std::ranges::find_if(data, [](ClioNode const& node) { + auto const it = std::ranges::find_if(clusterData, [](ClioNode const& node) { return node.dbRole != ClioNode::DbRole::ReadOnly; }); - if (it == data.end()) { + if (it == clusterData.end()) { LOG(util::LogService::warn()) << "No nodes allowed to write in the cluster"; return; } - if (it->uuid == selfId) { + if (*it->uuid == *selfId) { writerState->startWriting(); } else { writerState->giveUpWriting(); diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index bf1f21108d..6c410eaf19 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -26,6 +26,7 @@ target_sources( cluster/ClusterCommunicationServiceTests.cpp cluster/MetricsTests.cpp cluster/RepeatedTaskTests.cpp + cluster/WriterDeciderTests.cpp # ETL etl/AmendmentBlockHandlerTests.cpp etl/CacheLoaderSettingsTests.cpp diff --git a/tests/unit/cluster/WriterDeciderTests.cpp b/tests/unit/cluster/WriterDeciderTests.cpp new file mode 100644 index 0000000000..ecfd94459c --- /dev/null +++ b/tests/unit/cluster/WriterDeciderTests.cpp @@ -0,0 +1,226 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "cluster/Backend.hpp" +#include "cluster/ClioNode.hpp" +#include "cluster/WriterDecider.hpp" +#include "util/MockWriterState.hpp" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +using namespace cluster; + +enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction }; + +struct WriterDeciderTestParams { + std::string testName; + uint8_t selfUuidValue; + std::vector> nodes; + ExpectedAction expectedAction; + bool useEmptyClusterData = false; +}; + +struct WriterDeciderTest : testing::TestWithParam { + ~WriterDeciderTest() override + { + ctx.stop(); + ctx.join(); + } + + boost::asio::thread_pool ctx{1}; + std::unique_ptr writerState = std::make_unique(); + MockWriterState& writerStateRef = *writerState; + + static ClioNode + makeNode(boost::uuids::uuid const& uuid, ClioNode::DbRole role) + { + return ClioNode{ + .uuid = std::make_shared(uuid), + .updateTime = std::chrono::system_clock::now(), + .dbRole = role + }; + } + + static boost::uuids::uuid + makeUuid(uint8_t value) + { + boost::uuids::uuid uuid{}; + std::ranges::fill(uuid, value); + return uuid; + } +}; + +TEST_P(WriterDeciderTest, WriterSelection) +{ + auto const& params = GetParam(); + + auto const selfUuid = makeUuid(params.selfUuidValue); + + WriterDecider decider{ctx, std::move(writerState)}; + + auto clonedState = std::make_unique(); + + // Set up expectations based on expected action + switch (params.expectedAction) { + case ExpectedAction::StartWriting: + EXPECT_CALL(*clonedState, startWriting()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; + case ExpectedAction::GiveUpWriting: + EXPECT_CALL(*clonedState, giveUpWriting()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; + case ExpectedAction::NoAction: + if (not params.useEmptyClusterData) { + // For all-ReadOnly case, we still clone but don't call any action + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + } + // For empty cluster data, clone is never called + break; + } + + std::shared_ptr clusterData; + + if (params.useEmptyClusterData) { + clusterData = std::make_shared(std::unexpected(std::string("Communication failed"))); + } else { + std::vector nodes; + nodes.reserve(params.nodes.size()); + for (auto const& [uuidValue, role] : params.nodes) { + nodes.push_back(makeNode(makeUuid(uuidValue), role)); + } + clusterData = std::make_shared(std::move(nodes)); + } + + decider.onNewState(std::make_shared(selfUuid), clusterData); + + ctx.join(); +} + +INSTANTIATE_TEST_SUITE_P( + WriterDeciderTests, + WriterDeciderTest, + testing::Values( + WriterDeciderTestParams{ + .testName = "SelfNodeIsSelectedAsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsSelectedAsWriter", + .selfUuidValue = 0x02, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "NodesAreSortedByUUID", + .selfUuidValue = 0x02, + .nodes = + {{0x03, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Writer}, {0x01, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "FirstNodeAfterReadOnlyIsNotSelf", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "FirstNodeAfterReadOnlyIsSelf", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "AllNodesReadOnlyNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::ReadOnly}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "EmptyClusterDataNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {}, + .expectedAction = ExpectedAction::NoAction, + .useEmptyClusterData = true + }, + WriterDeciderTestParams{ + .testName = "SingleNodeClusterSelfIsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "NotWriterRoleIsSelectedWhenNoWriterRole", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::NotWriter}, {0x02, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "MixedRolesFirstNonReadOnlyIsSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}, + {0x04, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "ShuffledNodesAreSortedCorrectly", + .selfUuidValue = 0x04, + .nodes = + {{0x04, ClioNode::DbRole::Writer}, + {0x01, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "ShuffledNodesWithReadOnlySelfIsSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x05, ClioNode::DbRole::Writer}, + {0x01, ClioNode::DbRole::ReadOnly}, + {0x04, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::ReadOnly}}, + .expectedAction = ExpectedAction::StartWriting + } + ), + [](testing::TestParamInfo const& info) { return info.param.testName; } +); From 575857526fbda11cd7a62e4862f1c6893490ce28 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 3 Dec 2025 16:48:57 +0000 Subject: [PATCH 17/41] Add tests for ClusterCommunicationService --- tests/common/util/MockWriterState.hpp | 1 + .../ClusterCommunicationServiceTests.cpp | 302 +++++++++--------- 2 files changed, 146 insertions(+), 157 deletions(-) diff --git a/tests/common/util/MockWriterState.hpp b/tests/common/util/MockWriterState.hpp index 25b818be98..fbd4f8fff0 100644 --- a/tests/common/util/MockWriterState.hpp +++ b/tests/common/util/MockWriterState.hpp @@ -34,3 +34,4 @@ struct MockWriterStateBase : public etl::WriterStateInterface { }; using MockWriterState = testing::StrictMock; +using NiceMockWriterState = testing::NiceMock; diff --git a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp index 46552e8e4f..6c287d5033 100644 --- a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp +++ b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp @@ -13,7 +13,7 @@ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ //============================================================================== @@ -22,209 +22,197 @@ #include "data/BackendInterface.hpp" #include "util/MockBackendTestFixture.hpp" #include "util/MockPrometheus.hpp" -#include "util/TimeUtils.hpp" -#include "util/prometheus/Bool.hpp" -#include "util/prometheus/Gauge.hpp" +#include "util/MockWriterState.hpp" #include "util/prometheus/Prometheus.hpp" -#include +#include #include -#include -#include #include -#include #include -#include #include #include #include +#include #include -#include +#include #include -#include +#include #include +#include #include #include using namespace cluster; -/* -namespace { -std::vector const kOTHER_NODES_DATA = { - ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator()()), - .updateTime = util::systemTpFromUtcStr("2015-05-15T12:00:00Z", ClioNode::kTIME_FORMAT).value() - }, - ClioNode{ - .uuid = std::make_shared(boost::uuids::random_generator()()), - .updateTime = util::systemTpFromUtcStr("2015-05-15T12:00:01Z", ClioNode::kTIME_FORMAT).value() - }, -}; -} // namespace +struct ClusterCommunicationServiceTest : util::prometheus::WithPrometheus, MockBackendTest { + std::unique_ptr writerState = std::make_unique(); + NiceMockWriterState& writerStateRef = *writerState; -struct ClusterCommunicationServiceTest : util::prometheus::WithPrometheus, MockBackendTestStrict { - ClusterCommunicationService clusterCommunicationService{ - backend_, - std::chrono::milliseconds{5}, - std::chrono::milliseconds{9} - }; + static constexpr std::chrono::milliseconds kSHORT_INTERVAL{1}; - util::prometheus::GaugeInt& nodesInClusterMetric = PrometheusService::gaugeInt("cluster_nodes_total_number", {}); - util::prometheus::Bool isHealthyMetric = PrometheusService::boolMetric("cluster_communication_is_healthy", {}); + static boost::uuids::uuid + makeUuid(uint8_t value) + { + boost::uuids::uuid uuid{}; + std::ranges::fill(uuid, value); + return uuid; + } - std::mutex mtx; - std::condition_variable cv; + static ClioNode + makeNode(boost::uuids::uuid const& uuid, ClioNode::DbRole role) + { + return ClioNode{ + .uuid = std::make_shared(uuid), + .updateTime = std::chrono::system_clock::now(), + .dbRole = role + }; + } + + static std::string + nodeToJson(ClioNode const& node) + { + boost::json::value v = boost::json::value_from(node); + return boost::json::serialize(v); + } - void - notify() + ClusterCommunicationServiceTest() { - std::unique_lock const lock{mtx}; - cv.notify_one(); + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([]() { + auto state = std::make_unique(); + ON_CALL(*state, isReadOnly()).WillByDefault(testing::Return(false)); + ON_CALL(*state, isWriting()).WillByDefault(testing::Return(true)); + return state; + })); + ON_CALL(writerStateRef, isReadOnly()).WillByDefault(testing::Return(false)); + ON_CALL(writerStateRef, isWriting()).WillByDefault(testing::Return(true)); } - void - wait() + static bool + waitForSignal(std::binary_semaphore& sem, std::chrono::milliseconds timeout = std::chrono::milliseconds{1000}) { - std::unique_lock lock{mtx}; - cv.wait_until(lock, std::chrono::steady_clock::now() + std::chrono::milliseconds{100}); + return sem.try_acquire_for(timeout); } }; -TEST_F(ClusterCommunicationServiceTest, Write) +TEST_F(ClusterCommunicationServiceTest, BackendReadsAndWritesData) { - auto const selfUuid = *clusterCommunicationService.selfUuid(); + auto const otherUuid = makeUuid(0x02); + std::binary_semaphore fetchSemaphore{0}; + std::binary_semaphore writeSemaphore{0}; - auto const nowStr = util::systemTpToUtcStr(std::chrono::system_clock::now(), ClioNode::kTIME_FORMAT); - auto const nowStrPrefix = nowStr.substr(0, nowStr.size() - 3); + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {otherUuid, nodeToJson(makeNode(otherUuid, ClioNode::DbRole::Writer))} + }}; - EXPECT_CALL(*backend_, writeNodeMessage(selfUuid, testing::_)).WillOnce([&](auto&&, std::string const& jsonStr) { - auto const jv = boost::json::parse(jsonStr); - ASSERT_TRUE(jv.is_object()); - auto const& obj = jv.as_object(); - ASSERT_TRUE(obj.contains("update_time")); - ASSERT_TRUE(obj.at("update_time").is_string()); - EXPECT_THAT(std::string{obj.at("update_time").as_string()}, testing::StartsWith(nowStrPrefix)); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + fetchSemaphore.release(); + return fetchResult; + })); - notify(); - }); + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Invoke([&](auto, auto) { writeSemaphore.release(); })); - clusterCommunicationService.run(); - wait(); - // destructor of clusterCommunicationService calls .stop() -} + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; -TEST_F(ClusterCommunicationServiceTest, Read_FetchFailed) -{ - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { return std::unexpected{"Failed"}; }); - - clusterCommunicationService.run(); - wait(); - // call .stop() manually so that workers exit before expectations are called more times than we want - clusterCommunicationService.stop(); - - EXPECT_FALSE(isHealthyMetric); -} + service.run(); -TEST_F(ClusterCommunicationServiceTest, Read_FetchThrew) -{ - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly(testing::Throw(data::DatabaseTimeout{})); - - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); - - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + EXPECT_TRUE(waitForSignal(writeSemaphore)); + + service.stop(); } -TEST_F(ClusterCommunicationServiceTest, Read_GotInvalidJson) +TEST_F(ClusterCommunicationServiceTest, MetricsGetsNewStateFromBackend) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { - return std::vector>{ - {boost::uuids::random_generator()(), "invalid json"} - }; - }); + auto const otherUuid = makeUuid(0x02); + std::binary_semaphore writerActionSemaphore{0}; + + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {otherUuid, nodeToJson(makeNode(otherUuid, ClioNode::DbRole::Writer))} + }}; + + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { return fetchResult; })); - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([&]() mutable { + auto state = std::make_unique(); + ON_CALL(*state, startWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + ON_CALL(*state, giveUpWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + return state; + })); - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); + auto& nodesInClusterMetric = PrometheusService::gaugeInt("cluster_nodes_total_number", {}); + auto isHealthyMetric = PrometheusService::boolMetric("cluster_communication_is_healthy", {}); + + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + + // WriterDecider is called after metrics are updated so we could use it as a signal to stop + EXPECT_TRUE(waitForSignal(writerActionSemaphore)); + + service.stop(); + + EXPECT_EQ(nodesInClusterMetric.value(), 2); + EXPECT_TRUE(static_cast(isHealthyMetric)); } -TEST_F(ClusterCommunicationServiceTest, Read_GotInvalidNodeData) +TEST_F(ClusterCommunicationServiceTest, WriterDeciderCallsWriterStateMethodsAccordingly) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - notify(); - }); - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([](auto&&) { - return std::vector>{{boost::uuids::random_generator()(), "{}"}}; - }); - - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); - - EXPECT_FALSE(isHealthyMetric); - EXPECT_FALSE(clusterCommunicationService.clusterData().has_value()); + auto const smallerUuid = makeUuid(0x00); + std::binary_semaphore fetchSemaphore{0}; + std::binary_semaphore writerActionSemaphore{0}; + + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{ + {smallerUuid, nodeToJson(makeNode(smallerUuid, ClioNode::DbRole::Writer))} + }}; + + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + fetchSemaphore.release(); + return fetchResult; + })); + + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Return()); + + ON_CALL(writerStateRef, clone()).WillByDefault(testing::Invoke([&]() mutable { + auto state = std::make_unique(); + ON_CALL(*state, startWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + ON_CALL(*state, giveUpWriting()).WillByDefault(testing::Invoke([&]() { writerActionSemaphore.release(); })); + return state; + })); + + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; + + service.run(); + + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + EXPECT_TRUE(waitForSignal(writerActionSemaphore)); + + service.stop(); } -TEST_F(ClusterCommunicationServiceTest, Read_Success) +TEST_F(ClusterCommunicationServiceTest, StopHaltsBackendOperations) { - EXPECT_TRUE(isHealthyMetric); - EXPECT_EQ(nodesInClusterMetric.value(), 1); - - EXPECT_CALL(*backend_, writeNodeMessage).Times(2).WillOnce([](auto&&, auto&&) {}).WillOnce([this](auto&&, auto&&) { - auto const clusterData = clusterCommunicationService.clusterData(); - ASSERT_TRUE(clusterData.has_value()); - ASSERT_EQ(clusterData->size(), kOTHER_NODES_DATA.size() + 1); - for (auto const& node : kOTHER_NODES_DATA) { - auto const it = - std::ranges::find_if(*clusterData, [&](ClioNode const& n) { return *(n.uuid) == *(node.uuid); }); - EXPECT_NE(it, clusterData->cend()) << boost::uuids::to_string(*node.uuid); - } - auto const selfUuid = clusterCommunicationService.selfUuid(); - auto const it = - std::ranges::find_if(*clusterData, [&selfUuid](ClioNode const& node) { return node.uuid == selfUuid; }); - EXPECT_NE(it, clusterData->end()); - - notify(); - }); - - EXPECT_CALL(*backend_, fetchClioNodesData).WillRepeatedly([this](auto&&) { - auto const selfUuid = clusterCommunicationService.selfUuid(); - std::vector> result = { - {*selfUuid, R"JSON({"update_time": "2015-05-15:12:00:00"})JSON"}, - }; + std::atomic backendOperationsCount{0}; + std::binary_semaphore fetchSemaphore{0}; + + BackendInterface::ClioNodesDataFetchResult fetchResult{std::vector>{}}; - for (auto const& node : kOTHER_NODES_DATA) { - boost::json::value jsonValue; - boost::json::value_from(node, jsonValue); - result.emplace_back(*node.uuid, boost::json::serialize(jsonValue)); - } - return result; - }); + ON_CALL(*backend_, fetchClioNodesData).WillByDefault(testing::Invoke([&](auto) { + backendOperationsCount++; + fetchSemaphore.release(); + return fetchResult; + })); + ON_CALL(*backend_, writeNodeMessage).WillByDefault(testing::Invoke([&](auto&&, auto&&) { + backendOperationsCount++; + })); - clusterCommunicationService.run(); - wait(); - clusterCommunicationService.stop(); + ClusterCommunicationService service{backend_, std::move(writerState), kSHORT_INTERVAL, kSHORT_INTERVAL}; - EXPECT_TRUE(isHealthyMetric); - EXPECT_EQ(nodesInClusterMetric.value(), 3); + service.run(); + EXPECT_TRUE(waitForSignal(fetchSemaphore)); + service.stop(); + + auto const countAfterStop = backendOperationsCount.load(); + std::this_thread::sleep_for(std::chrono::milliseconds{50}); + EXPECT_EQ(backendOperationsCount.load(), countAfterStop); } -*/ From da5008b1cda483d44a3b393c7494b064fc15a05c Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 3 Dec 2025 17:27:09 +0000 Subject: [PATCH 18/41] Fix typo in variable name --- src/etl/ETLService.cpp | 4 ++-- src/etl/SystemState.hpp | 2 +- src/etl/WriterState.cpp | 2 +- src/etl/impl/Loading.cpp | 2 +- tests/unit/etl/ETLServiceTests.cpp | 10 +++++----- tests/unit/etl/WriterStateTests.cpp | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 2addca8bc6..5aa35f923f 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -349,7 +349,7 @@ ETLService::startMonitor(uint32_t seq) monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - if (state_->shouldGiveUpWriter) { + if (state_->shouldGiveUpWriting) { giveUpWriter(); } @@ -406,7 +406,7 @@ ETLService::giveUpWriter() { ASSERT(not state_->isStrictReadonly, "This should only happen on writer nodes"); state_->isWriting = false; - state_->shouldGiveUpWriter = false; + state_->shouldGiveUpWriting = false; LOG(log_.info()) << "Giving up writer seat"; taskMan_ = nullptr; } diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 9813d59bb4..d3cc8e15bd 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -69,7 +69,7 @@ struct SystemState { std::atomic_bool isStopping = false; /**< @brief Whether the software is stopping. */ std::atomic_bool shouldTakeoverWriting = false; /**< @brief Whether ETL should start writing to DB. */ - std::atomic_bool shouldGiveUpWriter = false; /**< @brief Whether ETL should stop writing to DB. */ + std::atomic_bool shouldGiveUpWriting = false; /**< @brief Whether ETL should stop writing to DB. */ /** * @brief Whether clio detected an amendment block. diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp index d455ebc625..f2c850b8a2 100644 --- a/src/etl/WriterState.cpp +++ b/src/etl/WriterState.cpp @@ -57,7 +57,7 @@ WriterState::giveUpWriting() if (not isWriting()) return; - systemState_->shouldTakeoverWriting = true; + systemState_->shouldGiveUpWriting = true; } std::unique_ptr diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index f27cc64f37..23ff2d0288 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -75,7 +75,7 @@ Loader::load(model::LedgerData const& data) << "; took " << duration << "ms"; if (not success) { - state_->shouldGiveUpWriter = true; + state_->shouldGiveUpWriting = true; LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; return std::unexpected(LoaderError::WriteConflict); } diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index e458a6332d..867ac7d370 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -370,13 +370,13 @@ TEST_F(ETLServiceTests, HandlesWriteConflictInMonitorSubscription) EXPECT_CALL(*cacheLoader_, load(kSEQ)); service_.run(); - systemState_->shouldGiveUpWriter = true; + systemState_->shouldGiveUpWriting = true; EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->shouldGiveUpWriter); + EXPECT_FALSE(systemState_->shouldGiveUpWriting); EXPECT_FALSE(systemState_->isWriting); } @@ -483,15 +483,15 @@ TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) service_.run(); systemState_->isWriting = true; - systemState_->shouldGiveUpWriter = true; // got a write conflict along the way + systemState_->shouldGiveUpWriting = true; // got a write conflict along the way EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->isWriting); // gives up writing - EXPECT_FALSE(systemState_->shouldGiveUpWriter); // and removes write conflict flag + EXPECT_FALSE(systemState_->isWriting); // gives up writing + EXPECT_FALSE(systemState_->shouldGiveUpWriting); // and removes write conflict flag } TEST_F(ETLServiceTests, CancelledLoadInitialLedger) diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp index 62ae06a1ed..14858489bc 100644 --- a/tests/unit/etl/WriterStateTests.cpp +++ b/tests/unit/etl/WriterStateTests.cpp @@ -64,11 +64,11 @@ TEST_F(WriterStateTest, StartWritingDoesNothingWhenAlreadyWriting) TEST_F(WriterStateTest, GiveUpWritingSetsFlag) { systemState->isWriting = true; - systemState->shouldTakeoverWriting = false; + systemState->shouldGiveUpWriting = false; writerState.giveUpWriting(); - EXPECT_TRUE(systemState->shouldTakeoverWriting); + EXPECT_TRUE(systemState->shouldGiveUpWriting); } TEST_F(WriterStateTest, GiveUpWritingDoesNothingWhenNotWriting) From e16efbfc12ead2454f1cc926afae207ee0037191 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 4 Dec 2025 17:32:31 +0000 Subject: [PATCH 19/41] Add clusterCommunicationService to graceful shutdown --- src/app/ClioApplication.cpp | 15 ++++++++-- src/app/Stopper.hpp | 9 +++++- src/cluster/ClusterCommunicationService.hpp | 5 ++-- src/cluster/Concepts.hpp | 33 +++++++++++++++++++++ tests/unit/app/StopperTests.cpp | 10 +++++++ 5 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 src/cluster/Concepts.hpp diff --git a/src/app/ClioApplication.cpp b/src/app/ClioApplication.cpp index 64928e3a2f..436813d14a 100644 --- a/src/app/ClioApplication.cpp +++ b/src/app/ClioApplication.cpp @@ -205,7 +205,16 @@ ClioApplication::run(bool const useNgWebServer) } appStopper_.setOnStop( - Stopper::makeOnStopCallback(httpServer.value(), *balancer, *etl, *subscriptions, *backend, cacheSaver, ioc) + Stopper::makeOnStopCallback( + httpServer.value(), + *balancer, + *etl, + *subscriptions, + *backend, + cacheSaver, + clusterCommunicationService, + ioc + ) ); // Blocks until stopped. @@ -221,7 +230,9 @@ ClioApplication::run(bool const useNgWebServer) auto const httpServer = web::makeHttpServer(config_, ioc, dosGuard, handler, cache); appStopper_.setOnStop( - Stopper::makeOnStopCallback(*httpServer, *balancer, *etl, *subscriptions, *backend, cacheSaver, ioc) + Stopper::makeOnStopCallback( + *httpServer, *balancer, *etl, *subscriptions, *backend, cacheSaver, clusterCommunicationService, ioc + ) ); // Blocks until stopped. diff --git a/src/app/Stopper.hpp b/src/app/Stopper.hpp index 190dd5df94..5b00d975c2 100644 --- a/src/app/Stopper.hpp +++ b/src/app/Stopper.hpp @@ -19,6 +19,7 @@ #pragma once +#include "cluster/Concepts.hpp" #include "data/BackendInterface.hpp" #include "data/LedgerCacheSaver.hpp" #include "etl/ETLServiceInterface.hpp" @@ -85,7 +86,10 @@ class Stopper { * @param ioc The io_context to stop. * @return The callback to be called on application stop. */ - template + template < + web::SomeServer ServerType, + data::SomeLedgerCacheSaver LedgerCacheSaverType, + cluster::SomeClusterCommunicationService ClusterCommunicationServiceType> static std::function makeOnStopCallback( ServerType& server, @@ -94,6 +98,7 @@ class Stopper { feed::SubscriptionManagerInterface& subscriptions, data::BackendInterface& backend, LedgerCacheSaverType& cacheSaver, + ClusterCommunicationServiceType& clusterCommunicationService, boost::asio::io_context& ioc ) { @@ -111,6 +116,8 @@ class Stopper { }); coroutineGroup.asyncWait(yield); + clusterCommunicationService.stop(); + etl.stop(); LOG(util::LogService::info()) << "ETL stopped"; diff --git a/src/cluster/ClusterCommunicationService.hpp b/src/cluster/ClusterCommunicationService.hpp index aa94c81b56..f73a7cd32b 100644 --- a/src/cluster/ClusterCommunicationService.hpp +++ b/src/cluster/ClusterCommunicationService.hpp @@ -20,6 +20,7 @@ #pragma once #include "cluster/Backend.hpp" +#include "cluster/Concepts.hpp" #include "cluster/Metrics.hpp" #include "cluster/WriterDecider.hpp" #include "data/BackendInterface.hpp" @@ -39,7 +40,7 @@ namespace cluster { /** * @brief Service to post and read messages to/from the cluster. It uses a backend to communicate with the cluster. */ -class ClusterCommunicationService { +class ClusterCommunicationService : public ClusterCommunicationServiceTag { // TODO: Use util::async::CoroExecutionContext after https://github.com/XRPLF/clio/issues/1973 is implemented boost::asio::thread_pool ctx_{1}; Backend backend_; @@ -65,7 +66,7 @@ class ClusterCommunicationService { std::chrono::steady_clock::duration writeInterval = kDEFAULT_WRITE_INTERVAL ); - ~ClusterCommunicationService(); + ~ClusterCommunicationService() override; ClusterCommunicationService(ClusterCommunicationService&&) = delete; ClusterCommunicationService(ClusterCommunicationService const&) = delete; diff --git a/src/cluster/Concepts.hpp b/src/cluster/Concepts.hpp new file mode 100644 index 0000000000..5b161dc866 --- /dev/null +++ b/src/cluster/Concepts.hpp @@ -0,0 +1,33 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2025, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#pragma once + +#include + +namespace cluster { + +struct ClusterCommunicationServiceTag { + virtual ~ClusterCommunicationServiceTag() = default; +}; + +template +concept SomeClusterCommunicationService = std::derived_from; + +} // namespace cluster diff --git a/tests/unit/app/StopperTests.cpp b/tests/unit/app/StopperTests.cpp index a9a03eb564..eb3f9b0704 100644 --- a/tests/unit/app/StopperTests.cpp +++ b/tests/unit/app/StopperTests.cpp @@ -17,6 +17,7 @@ */ //============================================================================== #include "app/Stopper.hpp" +#include "cluster/Concepts.hpp" #include "util/AsioContextTestFixture.hpp" #include "util/MockBackend.hpp" #include "util/MockETLService.hpp" @@ -87,6 +88,10 @@ struct StopperMakeCallbackTest : util::prometheus::WithPrometheus, SyncAsioConte MOCK_METHOD(void, waitToFinish, ()); }; + struct MockClusterCommunicationService : cluster::ClusterCommunicationServiceTag { + MOCK_METHOD(void, stop, (), ()); + }; + protected: testing::StrictMock serverMock_; testing::StrictMock loadBalancerMock_; @@ -94,6 +99,7 @@ struct StopperMakeCallbackTest : util::prometheus::WithPrometheus, SyncAsioConte testing::StrictMock subscriptionManagerMock_; testing::StrictMock backendMock_{util::config::ClioConfigDefinition{}}; testing::StrictMock cacheSaverMock_; + testing::StrictMock clusterCommunicationServiceMock_; boost::asio::io_context ioContextToStop_; bool @@ -115,6 +121,7 @@ TEST_F(StopperMakeCallbackTest, makeCallbackTest) subscriptionManagerMock_, backendMock_, cacheSaverMock_, + clusterCommunicationServiceMock_, ioContextToStop_ ); @@ -122,6 +129,9 @@ TEST_F(StopperMakeCallbackTest, makeCallbackTest) EXPECT_CALL(cacheSaverMock_, save).InSequence(s1).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(serverMock_, stop).InSequence(s1).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(loadBalancerMock_, stop).InSequence(s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); + EXPECT_CALL(clusterCommunicationServiceMock_, stop).InSequence(s1, s2).WillOnce([this]() { + EXPECT_FALSE(isContextStopped()); + }); EXPECT_CALL(etlServiceMock_, stop).InSequence(s1, s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); }); EXPECT_CALL(subscriptionManagerMock_, stop).InSequence(s1, s2).WillOnce([this]() { EXPECT_FALSE(isContextStopped()); From 847ed98516abb6025c63cbc3cf85415e46b1e31e Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 8 Jan 2026 12:24:28 +0000 Subject: [PATCH 20/41] Add writing command to etl::SystemState --- src/etl/ETLService.cpp | 20 ++++--- src/etl/ETLService.hpp | 2 +- src/etl/SystemState.hpp | 23 +++++++- src/etl/impl/LedgerPublisher.hpp | 17 +++++- src/etl/impl/Loading.cpp | 2 +- tests/unit/etl/ETLServiceTests.cpp | 78 +++++++++++++++++++++++-- tests/unit/etl/LedgerPublisherTests.cpp | 4 +- tests/unit/etl/LoadingTests.cpp | 50 ++++++++++++++++ 8 files changed, 175 insertions(+), 21 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 6f8c0d4ee6..595411cb4a 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -348,14 +348,21 @@ ETLService::startMonitor(uint32_t seq) { monitor_ = monitorProvider_->make(ctx_, backend_, ledgers_, seq); + systemStateWriteCommandSubscription_ = + state_->writeCommandSignal.connect([this](SystemState::WriteCommand command) { + switch (command) { + case etl::SystemState::WriteCommand::StartWriting: + attemptTakeoverWriter(); + break; + case etl::SystemState::WriteCommand::StopWriting: + giveUpWriter(); + break; + } + }); + monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - if (state_->writeConflict) { - LOG(log_.info()) << "Got a write conflict; Giving up writer seat immediately"; - giveUpWriter(); - } - if (not state_->isWriting) { auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { return backend_->fetchLedgerDiff(seq, yield); @@ -371,7 +378,7 @@ ETLService::startMonitor(uint32_t seq) monitorDbStalledSubscription_ = monitor_->subscribeToDbStalled([this]() { LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor"; if (not state_->isStrictReadonly and not state_->isWriting) - attemptTakeoverWriter(); + state_->writeCommandSignal(SystemState::WriteCommand::StartWriting); }); monitor_->run(); @@ -404,7 +411,6 @@ ETLService::giveUpWriter() { ASSERT(not state_->isStrictReadonly, "This should only happen on writer nodes"); state_->isWriting = false; - state_->writeConflict = false; taskMan_ = nullptr; } diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index 45185d4be4..689d4d14d1 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -74,7 +74,6 @@ #include #include #include -#include namespace etl { @@ -117,6 +116,7 @@ class ETLService : public ETLServiceInterface { boost::signals2::scoped_connection monitorNewSeqSubscription_; boost::signals2::scoped_connection monitorDbStalledSubscription_; + boost::signals2::scoped_connection systemStateWriteCommandSubscription_; std::optional> mainLoop_; diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 7f841665f4..188d53fcbd 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -23,7 +23,8 @@ #include "util/prometheus/Label.hpp" #include "util/prometheus/Prometheus.hpp" -#include +#include +#include namespace etl { @@ -50,8 +51,24 @@ struct SystemState { "Whether the process is writing to the database" ); - std::atomic_bool isStopping = false; /**< @brief Whether the software is stopping. */ - std::atomic_bool writeConflict = false; /**< @brief Whether a write conflict was detected. */ + /** + * @brief Commands for controlling the ETL writer state. + * + * These commands are emitted via writeCommandSignal to coordinate writer state transitions across components. + */ + enum class WriteCommand { + StartWriting, /**< Request to attempt taking over as the ETL writer */ + StopWriting /**< Request to give up the ETL writer role (e.g., due to write conflict) */ + }; + + /** + * @brief Signal for coordinating ETL writer state transitions. + * + * This signal allows components to request changes to the writer state without direct coupling. + * - Emitted with StartWriting when database stalls and node should attempt to become writer + * - Emitted with StopWriting when write conflicts are detected + */ + boost::signals2::signal writeCommandSignal; /** * @brief Whether clio detected an amendment block. diff --git a/src/etl/impl/LedgerPublisher.hpp b/src/etl/impl/LedgerPublisher.hpp index 0b48ca3f68..1db50299bc 100644 --- a/src/etl/impl/LedgerPublisher.hpp +++ b/src/etl/impl/LedgerPublisher.hpp @@ -45,6 +45,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,8 @@ class LedgerPublisher : public LedgerPublisherInterface { util::async::AnyStrand publishStrand_; + std::atomic_bool stop_{false}; + std::shared_ptr backend_; std::shared_ptr subscriptions_; std::reference_wrapper state_; // shared state for ETL @@ -125,7 +128,7 @@ class LedgerPublisher : public LedgerPublisherInterface { { LOG(log_.info()) << "Attempting to publish ledger = " << ledgerSequence; size_t numAttempts = 0; - while (not state_.get().isStopping) { + while (not stop_) { auto range = backend_->hardFetchLedgerRangeNoThrow(); if (!range || range->maxSequence < ledgerSequence) { @@ -258,6 +261,18 @@ class LedgerPublisher : public LedgerPublisherInterface { return *lastPublishedSequence_.lock(); } + /** + * @brief Stops publishing + * + * @note This is a basic implementation to satisfy tests. This will be improved in + * https://github.com/XRPLF/clio/issues/2833 + */ + void + stop() + { + stop_ = true; + } + private: void setLastClose(std::chrono::time_point lastCloseTime) diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index 59d2d0a9c7..9c378f253f 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -75,7 +75,7 @@ Loader::load(model::LedgerData const& data) << "; took " << duration << "ms"; if (not success) { - state_->writeConflict = true; + state_->writeCommandSignal(SystemState::WriteCommand::StopWriting); LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; return std::unexpected(LoaderError::WriteConflict); } diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 253009459c..59a570a44c 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -216,6 +216,10 @@ struct ETLServiceTests : util::prometheus::WithPrometheus, MockBackendTest { std::shared_ptr> monitorProvider_ = std::make_shared>(); std::shared_ptr systemState_ = std::make_shared(); + testing::StrictMock> mockWriteSignalCommandCallback_; + boost::signals2::scoped_connection writeCommandConnection_{ + systemState_->writeCommandSignal.connect(mockWriteSignalCommandCallback_.AsStdFunction()) + }; etl::ETLService service_{ ctx_, @@ -370,13 +374,13 @@ TEST_F(ETLServiceTests, HandlesWriteConflictInMonitorSubscription) EXPECT_CALL(*cacheLoader_, load(kSEQ)); service_.run(); - systemState_->writeConflict = true; + writeCommandConnection_.disconnect(); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->writeConflict); EXPECT_FALSE(systemState_->isWriting); } @@ -447,6 +451,8 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) .WillOnce(testing::Return(std::move(mockTaskManager))); + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + ASSERT_TRUE(capturedDbStalledCallback); capturedDbStalledCallback(); @@ -477,15 +483,15 @@ TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) service_.run(); systemState_->isWriting = true; - systemState_->writeConflict = true; // got a write conflict along the way + writeCommandConnection_.disconnect(); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); ASSERT_TRUE(capturedCallback); capturedCallback(kSEQ + 1); - EXPECT_FALSE(systemState_->isWriting); // gives up writing - EXPECT_FALSE(systemState_->writeConflict); // and removes write conflict flag + EXPECT_FALSE(systemState_->isWriting); // gives up writing } TEST_F(ETLServiceTests, CancelledLoadInitialLedger) @@ -539,3 +545,65 @@ TEST_F(ETLServiceTests, RunStopsIfInitialLoadIsCancelledByBalancer) EXPECT_FALSE(service_.isAmendmentBlocked()); EXPECT_FALSE(service_.isCorruptionDetected()); } + +TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenStrictReadonly) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedDbStalledCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { + capturedDbStalledCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, run); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = true; // strict readonly mode + systemState_->isWriting = false; + + // No signal should be emitted because node is in strict readonly mode + + ASSERT_TRUE(capturedDbStalledCallback); + capturedDbStalledCallback(); +} + +TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedDbStalledCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled).WillOnce([&capturedDbStalledCallback](auto callback) { + capturedDbStalledCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, run); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + systemState_->isWriting = true; // already writing + + // No signal should be emitted because node is already writing + + ASSERT_TRUE(capturedDbStalledCallback); + capturedDbStalledCallback(); +} diff --git a/tests/unit/etl/LedgerPublisherTests.cpp b/tests/unit/etl/LedgerPublisherTests.cpp index e4d422a9d8..5b3c73d0f3 100644 --- a/tests/unit/etl/LedgerPublisherTests.cpp +++ b/tests/unit/etl/LedgerPublisherTests.cpp @@ -216,15 +216,14 @@ TEST_F(ETLLedgerPublisherTest, PublishLedgerHeaderCloseTimeGreaterThanNow) TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqStopIsTrue) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = true; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); + publisher.stop(); EXPECT_FALSE(publisher.publish(kSEQ, {})); } TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqMaxAttempt) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = false; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); static constexpr auto kMAX_ATTEMPT = 2; @@ -238,7 +237,6 @@ TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqMaxAttempt) TEST_F(ETLLedgerPublisherTest, PublishLedgerSeqStopIsFalse) { auto dummyState = etl::SystemState{}; - dummyState.isStopping = false; auto publisher = impl::LedgerPublisher(ctx, backend_, mockSubscriptionManagerPtr, dummyState); LedgerRange const range{.minSequence = kSEQ, .maxSequence = kSEQ}; diff --git a/tests/unit/etl/LoadingTests.cpp b/tests/unit/etl/LoadingTests.cpp index 143f915a29..6631fde732 100644 --- a/tests/unit/etl/LoadingTests.cpp +++ b/tests/unit/etl/LoadingTests.cpp @@ -188,3 +188,53 @@ TEST_F(LoadingAssertTest, LoadInitialLedgerHasDataInDB) EXPECT_CLIO_ASSERT_FAIL({ [[maybe_unused]] auto unused = loader_.loadInitialLedger(data); }); } + +TEST_F(LoadingTests, LoadWriteConflictEmitsStopWritingSignal) +{ + state_->isWriting = true; // writer is active + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(false)); // simulate write conflict + EXPECT_CALL(mockSignalCallback, Call(etl::SystemState::WriteCommand::StopWriting)); + + auto result = loader_.load(data); + EXPECT_FALSE(result.has_value()); + EXPECT_EQ(result.error(), etl::LoaderError::WriteConflict); +} + +TEST_F(LoadingTests, LoadSuccessDoesNotEmitSignal) +{ + state_->isWriting = true; // writer is active + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(true)); // success + // No signal should be emitted on success + + auto result = loader_.load(data); + EXPECT_TRUE(result.has_value()); +} + +TEST_F(LoadingTests, LoadWhenNotWritingDoesNotCheckConflict) +{ + state_->isWriting = false; // not a writer + auto const data = createTestData(); + testing::StrictMock> mockSignalCallback; + + auto connection = state_->writeCommandSignal.connect(mockSignalCallback.AsStdFunction()); + + EXPECT_CALL(*mockRegistryPtr_, dispatch(data)); + // doFinishWrites should not be called when not writing + EXPECT_CALL(*backend_, doFinishWrites()).Times(0); + // No signal should be emitted + + auto result = loader_.load(data); + EXPECT_TRUE(result.has_value()); +} From fd01dbba0a75e0d551f574987f98c917b782c242 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 8 Jan 2026 12:25:35 +0000 Subject: [PATCH 21/41] Improve channels --- src/util/Channel.hpp | 82 +++++++++++++++++++++++++------- tests/unit/util/ChannelTests.cpp | 8 ++-- 2 files changed, 70 insertions(+), 20 deletions(-) diff --git a/src/util/Channel.hpp b/src/util/Channel.hpp index 0480389ee6..e88750874e 100644 --- a/src/util/Channel.hpp +++ b/src/util/Channel.hpp @@ -42,15 +42,23 @@ struct ChannelInstantiated; } // namespace detail #endif +enum class ProducerType { Single, Multi }; +enum class ConsumerType { Single, Multi }; + /** * @brief Represents a go-like channel, a multi-producer (Sender) multi-consumer (Receiver) thread-safe data pipe. * @note Use INSTANTIATE_CHANNEL_FOR_CLANG macro when using this class. See docs at the bottom of the file for more * details. * * @tparam T The type of data the channel transfers + * @tparam P ProducerType::Multi (default) for multi-producer or ProducerType::Single for single-producer + * @tparam C ConsumerType::Multi (default) for multi-consumer or ConsumerType::Single for single-consumer */ -template +template class Channel { + static constexpr bool kIS_MULTI_PRODUCER = (P == ProducerType::Multi); + static constexpr bool kIS_MULTI_CONSUMER = (C == ConsumerType::Multi); + private: class ControlBlock { using InternalChannelType = boost::asio::experimental::concurrent_channel; @@ -101,30 +109,52 @@ class Channel { } }; +public: /** * @brief The sending end of a channel. * - * Sender is copyable and movable. The channel remains open as long as at least one Sender exists. + * Sender is movable. For multi-producer channels, Sender is also copyable. + * The channel remains open as long as at least one Sender exists. * When all Sender instances are destroyed, the channel is closed and receivers will receive std::nullopt. */ class Sender { std::shared_ptr shared_; - std::shared_ptr guard_; + std::conditional_t, Guard> guard_; + + friend class Channel; - public: /** * @brief Constructs a Sender from a shared control block. * @param shared The shared control block managing the channel state */ - Sender(std::shared_ptr shared) - : shared_(std::move(shared)), guard_(std::make_shared(shared_)) {}; + Sender(std::shared_ptr shared) : shared_(shared) + { + if constexpr (kIS_MULTI_PRODUCER) { + guard_ = std::make_shared(shared); + } else { + guard_ = Guard{std::move(shared)}; + } + } + public: Sender(Sender&&) = default; - Sender(Sender const&) = default; + Sender(Sender const&) + requires kIS_MULTI_PRODUCER + = default; + Sender(Sender const&) + requires(!kIS_MULTI_PRODUCER) + = delete; + Sender& operator=(Sender&&) = default; Sender& - operator=(Sender const&) = default; + operator=(Sender const&) + requires kIS_MULTI_PRODUCER + = default; + Sender& + operator=(Sender const&) + requires(!kIS_MULTI_PRODUCER) + = delete; /** * @brief Asynchronously sends data through the channel using a coroutine. @@ -201,27 +231,48 @@ class Channel { /** * @brief The receiving end of a channel. * - * Receiver is copyable and movable. Multiple receivers can consume from the same channel concurrently. + * Receiver is movable. For multi-consumer channels, Receiver is also copyable. + * Multiple receivers can consume from the same multi-consumer channel concurrently. * When all Receiver instances are destroyed, the channel is closed and senders will fail to send. */ class Receiver { std::shared_ptr shared_; - std::shared_ptr guard_; + std::conditional_t, Guard> guard_; + + friend class Channel; - public: /** * @brief Constructs a Receiver from a shared control block. * @param shared The shared control block managing the channel state */ - Receiver(std::shared_ptr shared) - : shared_(std::move(shared)), guard_(std::make_shared(shared_)) {}; + Receiver(std::shared_ptr shared) : shared_(shared) + { + if constexpr (kIS_MULTI_CONSUMER) { + guard_ = std::make_shared(shared); + } else { + guard_ = Guard{std::move(shared)}; + } + } + public: Receiver(Receiver&&) = default; - Receiver(Receiver const&) = default; + Receiver(Receiver const&) + requires kIS_MULTI_CONSUMER + = default; + Receiver(Receiver const&) + requires(!kIS_MULTI_CONSUMER) + = delete; + Receiver& operator=(Receiver&&) = default; Receiver& - operator=(Receiver const&) = default; + operator=(Receiver const&) + requires kIS_MULTI_CONSUMER + = default; + Receiver& + operator=(Receiver const&) + requires(!kIS_MULTI_CONSUMER) + = delete; /** * @brief Attempts to receive data from the channel without blocking. @@ -296,7 +347,6 @@ class Channel { } }; -public: /** * @brief Factory function to create channel components. * @param context A supported context type (either io_context or thread_pool) diff --git a/tests/unit/util/ChannelTests.cpp b/tests/unit/util/ChannelTests.cpp index 536f869448..2a43ec373f 100644 --- a/tests/unit/util/ChannelTests.cpp +++ b/tests/unit/util/ChannelTests.cpp @@ -190,7 +190,7 @@ TEST_P(ChannelSpawnTest, MultipleSendersMultipleReceivers) context_.withExecutor([this](auto& executor) { auto [sender, receiver] = util::Channel::create(executor, 10); util::Mutex> receivedValues; - std::vector receivers(kNUM_RECEIVERS, receiver); + std::vector receivers(kNUM_RECEIVERS, receiver); for (auto receiverId = 0uz; receiverId < kNUM_RECEIVERS; ++receiverId) { util::spawn( @@ -405,7 +405,7 @@ TEST_P(ChannelCallbackTest, MultipleSendersMultipleReceivers) context_.withExecutor([this](auto& executor) { auto [sender, receiver] = util::Channel::create(executor, 10); util::Mutex> receivedValues; - std::vector receivers(kNUM_RECEIVERS, receiver); + std::vector receivers(kNUM_RECEIVERS, receiver); for (auto receiverId = 0uz; receiverId < kNUM_RECEIVERS; ++receiverId) { auto& receiverRef = receivers[receiverId]; @@ -531,8 +531,8 @@ TEST_P(ChannelCallbackTest, TryMethodsWithClosedChannel) context_.withExecutor([this](auto& executor) { std::atomic_bool testCompleted{false}; auto [sender, receiver] = util::Channel::create(executor, 3); - auto receiverPtr = std::make_shared(std::move(receiver)); - auto senderPtr = std::make_shared>(std::move(sender)); + auto receiverPtr = std::make_shared::Receiver>(std::move(receiver)); + auto senderPtr = std::make_shared::Sender>>(std::move(sender)); boost::asio::post(executor, [receiverPtr, senderPtr, &testCompleted]() { EXPECT_TRUE(senderPtr->value().trySend(100)); From c9cdcd0a094a8d346f57e504d4204fc90d4f8940 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 8 Jan 2026 16:44:02 +0000 Subject: [PATCH 22/41] Add async framework support in Channel --- src/etl/SystemState.hpp | 1 + src/util/Channel.hpp | 26 ++++++++++++++++++- src/util/async/Concepts.hpp | 21 +++++++++++++++ .../async/context/BasicExecutionContext.hpp | 19 ++++++++++++-- 4 files changed, 64 insertions(+), 3 deletions(-) diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 188d53fcbd..1b91fc94ee 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -22,6 +22,7 @@ #include "util/prometheus/Bool.hpp" #include "util/prometheus/Label.hpp" #include "util/prometheus/Prometheus.hpp" +#include "util/Channel.hpp" #include #include diff --git a/src/util/Channel.hpp b/src/util/Channel.hpp index e88750874e..fc4f00dafa 100644 --- a/src/util/Channel.hpp +++ b/src/util/Channel.hpp @@ -19,6 +19,8 @@ #pragma once +#include "util/async/Concepts.hpp" + #include #include #include @@ -42,7 +44,20 @@ struct ChannelInstantiated; } // namespace detail #endif +/** + * @brief Specifies the producer concurrency model for a Channel. + * + * - Single: Only one Sender can exist (non-copyable). Uses direct Guard ownership for zero overhead. + * - Multi: Multiple Senders can exist (copyable). Uses shared_ptr for shared ownership. + */ enum class ProducerType { Single, Multi }; + +/** + * @brief Specifies the consumer concurrency model for a Channel. + * + * - Single: Only one Receiver can exist (non-copyable). Uses direct Guard ownership for zero overhead. + * - Multi: Multiple Receivers can exist (copyable). Uses shared_ptr for shared ownership. + */ enum class ConsumerType { Single, Multi }; /** @@ -66,7 +81,16 @@ class Channel { InternalChannelType ch_; public: - ControlBlock(auto&& context, std::size_t capacity) : executor_(context.get_executor()), ch_(context, capacity) + template + requires(not async::SomeExecutionContext) + ControlBlock(ContextType&& context, std::size_t capacity) + : executor_(context.get_executor()), ch_(context, capacity) + { + } + + template + ControlBlock(ContextType&& context, std::size_t capacity) + : executor_(context.getExecutor().get_executor()), ch_(context.getExecutor(), capacity) { } diff --git a/src/util/async/Concepts.hpp b/src/util/async/Concepts.hpp index 7a50999f56..1bde21e6eb 100644 --- a/src/util/async/Concepts.hpp +++ b/src/util/async/Concepts.hpp @@ -29,6 +29,27 @@ namespace util::async { +/** + * @brief Tag type for identifying execution context types. + * + * Types that inherit from this tag can be detected using the SomeExecutionContext concept. + * This allows generic code to differentiate between raw Boost.Asio contexts and wrapped execution contexts. + */ +struct ExecutionContextTag { + virtual ~ExecutionContextTag() = default; +}; + +/** + * @brief Concept that identifies types derived from ExecutionContextTag. + * + * This concept is used to detect custom execution context wrappers (like BasicExecutionContext) + * and distinguish them from raw Boost.Asio contexts (io_context, thread_pool, etc.). + * + * @tparam T The type to check + */ +template +concept SomeExecutionContext = std::derived_from, ExecutionContextTag>; + /** * @brief Specifies the interface for an entity that can be stopped */ diff --git a/src/util/async/context/BasicExecutionContext.hpp b/src/util/async/context/BasicExecutionContext.hpp index 5fab8fdfcf..74c7f4435d 100644 --- a/src/util/async/context/BasicExecutionContext.hpp +++ b/src/util/async/context/BasicExecutionContext.hpp @@ -129,7 +129,7 @@ template < typename DispatcherType, typename TimerContextProvider = impl::SelfContextProvider, typename ErrorHandlerType = impl::DefaultErrorHandler> -class BasicExecutionContext { +class BasicExecutionContext : public ExecutionContextTag { ContextType context_; /** @cond */ @@ -182,7 +182,7 @@ class BasicExecutionContext { /** * @brief Stops the underlying thread pool. */ - ~BasicExecutionContext() + ~BasicExecutionContext() override { stop(); } @@ -402,6 +402,20 @@ class BasicExecutionContext { { context_.join(); } + + /** + * @brief Get the underlying executor. + * + * Provides access to the wrapped executor for cases where the execution context + * needs to interact with components that require explicit executor access (like Channel). + * + * @return Reference to the underlying executor + */ + typename ContextType::Executor& + getExecutor() + { + return context_.getExecutor(); + } }; /** @@ -428,3 +442,4 @@ using PoolExecutionContext = BasicExecutionContext; } // namespace util::async + From d71c466675031baa07c0463429a1ea2bf61245a1 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 8 Jan 2026 17:28:54 +0000 Subject: [PATCH 23/41] Apply bug fixes --- src/etl/ETLService.cpp | 43 +++++++++++++++++------ src/etl/ETLService.hpp | 4 +++ src/util/async/context/impl/Execution.hpp | 29 +++++++++------ 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 595411cb4a..8d01bf3687 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -173,6 +173,7 @@ ETLService::ETLService( , state_(std::move(state)) , startSequence_(config.get().maybeValue("start_sequence")) , finishSequence_(config.get().maybeValue("finish_sequence")) + , writeCommandStrand_(ctx_.makeStrand()) { ASSERT(not state_->isWriting, "ETL should never start in writer mode"); @@ -232,6 +233,13 @@ ETLService::stop() { LOG(log_.info()) << "Stop called"; + systemStateWriteCommandSubscription_.disconnect(); + auto count = runningWriteCommandHandlers_.load(); + while (count != 0) { + runningWriteCommandHandlers_.wait(count); // Blocks until value changes + count = runningWriteCommandHandlers_.load(); + } + if (mainLoop_) mainLoop_->wait(); if (taskMan_) @@ -350,26 +358,39 @@ ETLService::startMonitor(uint32_t seq) systemStateWriteCommandSubscription_ = state_->writeCommandSignal.connect([this](SystemState::WriteCommand command) { - switch (command) { - case etl::SystemState::WriteCommand::StartWriting: - attemptTakeoverWriter(); - break; - case etl::SystemState::WriteCommand::StopWriting: - giveUpWriter(); - break; - } + ++runningWriteCommandHandlers_; + writeCommandStrand_.submit([this, command]() { + switch (command) { + case etl::SystemState::WriteCommand::StartWriting: + attemptTakeoverWriter(); + break; + case etl::SystemState::WriteCommand::StopWriting: + giveUpWriter(); + break; + } + --runningWriteCommandHandlers_; + runningWriteCommandHandlers_.notify_one(); + }); }); monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - if (not state_->isWriting) { + // TODO(skuznetsov): check that this doesn't break anything + auto const cacheNeedsUpdate = backend_->cache().latestLedgerSequence() < seq; + auto const backendRange = backend_->fetchLedgerRange(); + auto const backendNeedsUpdate = backendRange.has_value() and backendRange->maxSequence < seq; + + if (cacheNeedsUpdate or backendNeedsUpdate) { auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { return backend_->fetchLedgerDiff(seq, yield); }); - cacheUpdater_->update(seq, diff); - backend_->updateRange(seq); + if (cacheNeedsUpdate) + cacheUpdater_->update(seq, diff); + + if (backendNeedsUpdate) + backend_->updateRange(seq); } publisher_->publish(seq, {}); diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index 689d4d14d1..163edb547a 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -52,6 +52,7 @@ #include "feed/SubscriptionManagerInterface.hpp" #include "util/async/AnyExecutionContext.hpp" #include "util/async/AnyOperation.hpp" +#include "util/async/AnyStrand.hpp" #include "util/config/ConfigDefinition.hpp" #include "util/log/Logger.hpp" @@ -69,6 +70,7 @@ #include #include +#include #include #include #include @@ -117,6 +119,8 @@ class ETLService : public ETLServiceInterface { boost::signals2::scoped_connection monitorNewSeqSubscription_; boost::signals2::scoped_connection monitorDbStalledSubscription_; boost::signals2::scoped_connection systemStateWriteCommandSubscription_; + util::async::AnyStrand writeCommandStrand_; + std::atomic runningWriteCommandHandlers_{0}; std::optional> mainLoop_; diff --git a/src/util/async/context/impl/Execution.hpp b/src/util/async/context/impl/Execution.hpp index 020773ba5a..ffc32cd452 100644 --- a/src/util/async/context/impl/Execution.hpp +++ b/src/util/async/context/impl/Execution.hpp @@ -36,17 +36,26 @@ struct SpawnDispatchStrategy { { auto op = outcome.getOperation(); - util::spawn( - ctx.getExecutor(), - [outcome = std::forward(outcome), fn = std::forward(fn)](auto yield) mutable { - if constexpr (SomeStoppableOutcome) { - auto& stopSource = outcome.getStopSource(); - std::invoke(std::forward(fn), outcome, stopSource, stopSource[yield]); - } else { + if constexpr (SomeStoppableOutcome) { + util::spawn( + ctx.getExecutor(), + [outcome = std::forward(outcome), fn = std::forward(fn)](auto yield) mutable { + if constexpr (SomeStoppableOutcome) { + auto& stopSource = outcome.getStopSource(); + std::invoke(std::forward(fn), outcome, stopSource, stopSource[yield]); + } else { + std::invoke(std::forward(fn), outcome); + } + } + ); + } else { + boost::asio::post( + ctx.getExecutor(), + [outcome = std::forward(outcome), fn = std::forward(fn)]() mutable { std::invoke(std::forward(fn), outcome); } - } - ); + ); + } return op; } @@ -55,7 +64,7 @@ struct SpawnDispatchStrategy { static void post(ContextType& ctx, FnType&& fn) { - util::spawn(ctx.getExecutor(), [fn = std::forward(fn)](auto) mutable { + boost::asio::post(ctx.getExecutor(), [fn = std::forward(fn)]() mutable { std::invoke(std::forward(fn)); }); } From 4573575331c90afc46d1ea7cf87746de0ec030fd Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 9 Jan 2026 17:49:36 +0000 Subject: [PATCH 24/41] Add tests --- tests/unit/etl/ETLServiceTests.cpp | 238 +++++++++++++++++++++++++++++ 1 file changed, 238 insertions(+) diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 59a570a44c..89327fe46b 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -607,3 +607,241 @@ TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) ASSERT_TRUE(capturedDbStalledCallback); capturedDbStalledCallback(); } + +TEST_F(ETLServiceTests, CacheUpdatesDependOnActualCacheState_WriterMode) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = true; // In writer mode + + // Simulate cache is behind (e.g., update failed previously) + // Cache latestLedgerSequence returns kSEQ (behind the new seq kSEQ + 1) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ); // Set cache to kSEQ + + std::vector const dummyDiff = {}; + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ + 1, testing::_)).WillOnce(testing::Return(dummyDiff)); + + // Cache should be updated even though we're in writer mode + EXPECT_CALL(*cacheUpdater_, update(kSEQ + 1, testing::A const&>())); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, OnlyCacheUpdatesWhenBackendIsCurrent) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set backend range to be at kSEQ + 1 (already current) + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = false; + + // Cache is behind (at kSEQ) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ); + + std::vector const dummyDiff = {}; + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ + 1, testing::_)).WillOnce(testing::Return(dummyDiff)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ + 1, testing::A const&>())); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, NoUpdatesWhenBothCacheAndBackendAreCurrent) +{ + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedCallback](auto callback) { + capturedCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + // Set backend range to be at kSEQ + 1 (already current) + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + + // Cache is current (at kSEQ + 1) + std::vector const emptyObjs = {}; + backend_->cache().update(emptyObjs, kSEQ + 1); + + // Neither should be updated + EXPECT_CALL(*backend_, fetchLedgerDiff).Times(0); + EXPECT_CALL(*cacheUpdater_, update(testing::_, testing::A const&>())).Times(0); + + EXPECT_CALL(*publisher_, publish(kSEQ + 1, testing::_, testing::_)); + + ASSERT_TRUE(capturedCallback); + capturedCallback(kSEQ + 1); +} + +TEST_F(ETLServiceTests, StopWaitsForWriteCommandHandlersToComplete) +{ + auto mockMonitor = std::make_unique>(); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + + auto mockTaskManager = std::make_unique>(); + + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager))); + + // Emit a command + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + + // The test context processes operations synchronously, so the handler should have run + // Stop should wait for the handler to complete and disconnect the subscription + service_.stop(); + + // Verify stop() returned, meaning all handlers completed + SUCCEED(); +} + +TEST_F(ETLServiceTests, WriteConflictIsHandledImmediately_NotDelayed) +{ + // This test verifies that write conflicts are handled immediately via signal, + // not delayed until the next sequence notification (the old behavior) + + auto mockMonitor = std::make_unique>(); + auto& mockMonitorRef = *mockMonitor; + std::function capturedNewSeqCallback; + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + EXPECT_CALL(mockMonitorRef, subscribeToNewSequence).WillOnce([&capturedNewSeqCallback](auto callback) { + capturedNewSeqCallback = callback; + return boost::signals2::scoped_connection{}; + }); + EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); + EXPECT_CALL(mockMonitorRef, run); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isWriting = true; + + // Emit StopWriting signal (simulating write conflict from Loader) + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StopWriting)); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); + + // The test context processes operations synchronously, so the handler should have run immediately + // Verify that isWriting is immediately set to false + EXPECT_FALSE(systemState_->isWriting); +} + +TEST_F(ETLServiceTests, WriteCommandsAreSerializedOnStrand) +{ + auto mockMonitor = std::make_unique>(); + + EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { + return std::move(mockMonitor); + }); + + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); + systemState_->isStrictReadonly = false; + systemState_->isWriting = false; + + auto mockTaskManager1 = std::make_unique>(); + auto mockTaskManager2 = std::make_unique>(); + + // Set up expectations for the sequence of write commands + // The signals should be processed in order: StartWriting, StopWriting, StartWriting + { + testing::InSequence seq; + + // First StartWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager1))); + + // Then StopWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StopWriting)); + + // Finally second StartWriting + EXPECT_CALL(mockWriteSignalCommandCallback_, Call(etl::SystemState::WriteCommand::StartWriting)); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce(testing::Return(std::move(mockTaskManager2))); + } + + // Emit multiple signals rapidly - they should be serialized on the strand + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StopWriting); + systemState_->writeCommandSignal(etl::SystemState::WriteCommand::StartWriting); + + // The test context processes operations synchronously, so all signals should have been processed + // Final state should be writing (last signal was StartWriting) + EXPECT_TRUE(systemState_->isWriting); +} From 4599a70249b010c67d099d25c72565fbbe0bcf3d Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 9 Jan 2026 17:50:56 +0000 Subject: [PATCH 25/41] Run pre-commit --- src/etl/ETLService.cpp | 1 - src/etl/SystemState.hpp | 2 +- src/util/async/context/BasicExecutionContext.hpp | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 8d01bf3687..71e489950e 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -376,7 +376,6 @@ ETLService::startMonitor(uint32_t seq) monitorNewSeqSubscription_ = monitor_->subscribeToNewSequence([this](uint32_t seq) { LOG(log_.info()) << "ETLService (via Monitor) got new seq from db: " << seq; - // TODO(skuznetsov): check that this doesn't break anything auto const cacheNeedsUpdate = backend_->cache().latestLedgerSequence() < seq; auto const backendRange = backend_->fetchLedgerRange(); auto const backendNeedsUpdate = backendRange.has_value() and backendRange->maxSequence < seq; diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 1b91fc94ee..22245b6d5c 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -19,10 +19,10 @@ #pragma once +#include "util/Channel.hpp" #include "util/prometheus/Bool.hpp" #include "util/prometheus/Label.hpp" #include "util/prometheus/Prometheus.hpp" -#include "util/Channel.hpp" #include #include diff --git a/src/util/async/context/BasicExecutionContext.hpp b/src/util/async/context/BasicExecutionContext.hpp index 74c7f4435d..be8a6a2001 100644 --- a/src/util/async/context/BasicExecutionContext.hpp +++ b/src/util/async/context/BasicExecutionContext.hpp @@ -442,4 +442,3 @@ using PoolExecutionContext = BasicExecutionContext; } // namespace util::async - From a64c8c43c6eaf3331e777340f31b9c622995c0d5 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 13 Jan 2026 14:53:24 +0000 Subject: [PATCH 26/41] Add fallback writer role --- src/cluster/ClioNode.cpp | 3 +++ src/cluster/ClioNode.hpp | 2 +- src/cluster/WriterDecider.cpp | 15 +++++++++++++++ src/etl/ETLService.cpp | 1 + src/etl/SystemState.hpp | 7 +++++++ src/etl/WriterState.cpp | 12 ++++++++++++ src/etl/WriterState.hpp | 12 ++++++++++++ src/etl/impl/Loading.cpp | 1 + tests/common/util/MockWriterState.hpp | 2 ++ 9 files changed, 54 insertions(+), 1 deletion(-) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index cf16b6d4a8..42d2f1adbd 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -53,6 +53,9 @@ ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState if (writerState.isReadOnly()) { return ClioNode::DbRole::ReadOnly; } + if (writerState.isFallback()) { + return ClioNode::DbRole::Fallback; + } return writerState.isWriting() ? ClioNode::DbRole::Writer : ClioNode::DbRole::NotWriter; }(); diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index 3b70d6f017..ed583b28e5 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -40,7 +40,7 @@ struct ClioNode { static constexpr char const* kTIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"; /** @brief Database role */ - enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, MAX = 2 }; + enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, Fallback = 3, MAX = 3 }; using UUID = std::shared_ptr; using cUUID = std::shared_ptr; diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index d7b6048eee..0d67db1238 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -22,6 +22,7 @@ #include "cluster/Backend.hpp" #include "cluster/ClioNode.hpp" #include "etl/WriterState.hpp" +#include "util/Assert.hpp" #include "util/Spawn.hpp" #include "util/log/Logger.hpp" @@ -50,6 +51,20 @@ WriterDecider::onNewState(ClioNode::cUUID selfId, std::shared_ptrclone(), selfId = std::move(selfId), clusterData = clusterData->value()](auto&&) mutable { + auto const selfData = + std::ranges::find_if(clusterData, [&selfId](ClioNode const& node) { return node.uuid == selfId; }); + ASSERT(selfData != clusterData.end(), "Self data should always be in the cluster data"); + if (selfData->dbRole == ClioNode::DbRole::ReadOnly or selfData->dbRole == ClioNode::DbRole::Fallback) { + return; + } + + if (std::ranges::any_of(clusterData, [](ClioNode const& node) { + return node.dbRole == ClioNode::DbRole::Fallback; + })) { + writerState->setWriterDecidingFallback(); + return; + } + std::ranges::sort(clusterData, [](ClioNode const& lhs, ClioNode const& rhs) { return *lhs.uuid < *rhs.uuid; }); diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 3c0c8451eb..dc35358216 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -397,6 +397,7 @@ ETLService::startMonitor(uint32_t seq) LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor"; if (not state_->isStrictReadonly and not state_->isWriting) state_->writeCommandSignal(SystemState::WriteCommand::StartWriting); + state_->isWriterDecidingFallback = true; }); monitor_->run(); diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 671f995111..21bcf8b1c3 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -27,6 +27,7 @@ #include #include + #include namespace etl { @@ -111,6 +112,12 @@ struct SystemState { util::prometheus::Labels{}, "Whether clio detected a corruption that needs manual attention" ); + + util::prometheus::Bool isWriterDecidingFallback = PrometheusService::boolMetric( + "etl_writing_deciding_fallback", + util::prometheus::Labels{}, + "Whether clio detected a corruption that needs manual attention" + ); }; } // namespace etl diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp index 3a81f85f97..abbfabfd56 100644 --- a/src/etl/WriterState.cpp +++ b/src/etl/WriterState.cpp @@ -60,6 +60,18 @@ WriterState::giveUpWriting() systemState_->writeCommandSignal(SystemState::WriteCommand::StopWriting); } +void +WriterState::setWriterDecidingFallback() +{ + systemState_->isWriterDecidingFallback = true; +} + +bool +WriterState::isFallback() const +{ + return systemState_->isWriterDecidingFallback; +} + std::unique_ptr WriterState::clone() const { diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index 458bac1043..e7b1bfe3d1 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -68,6 +68,12 @@ class WriterStateInterface { virtual void giveUpWriting() = 0; + [[nodiscard]] virtual bool + isFallback() const = 0; + + virtual void + setWriterDecidingFallback() = 0; + [[nodiscard]] virtual std::unique_ptr clone() const = 0; }; @@ -118,6 +124,12 @@ class WriterState : public WriterStateInterface { void giveUpWriting() override; + void + setWriterDecidingFallback() override; + + bool + isFallback() const override; + std::unique_ptr clone() const override; }; diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index 9c378f253f..aed400e011 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -76,6 +76,7 @@ Loader::load(model::LedgerData const& data) if (not success) { state_->writeCommandSignal(SystemState::WriteCommand::StopWriting); + state_->isWriterDecidingFallback = true; LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; return std::unexpected(LoaderError::WriteConflict); } diff --git a/tests/common/util/MockWriterState.hpp b/tests/common/util/MockWriterState.hpp index fbd4f8fff0..97a57f5dcb 100644 --- a/tests/common/util/MockWriterState.hpp +++ b/tests/common/util/MockWriterState.hpp @@ -30,6 +30,8 @@ struct MockWriterStateBase : public etl::WriterStateInterface { MOCK_METHOD(bool, isWriting, (), (const, override)); MOCK_METHOD(void, startWriting, (), (override)); MOCK_METHOD(void, giveUpWriting, (), (override)); + MOCK_METHOD(void, setWriterDecidingFallback, (), (override)); + MOCK_METHOD(bool, isFallback, (), (const, override)); MOCK_METHOD(std::unique_ptr, clone, (), (const, override)); }; From 584417ada3072e2fd2d28363a63ae8bc304044ab Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 13 Jan 2026 15:24:29 +0000 Subject: [PATCH 27/41] Fix tests and add new --- src/cluster/ClioNode.cpp | 4 ++ src/cluster/ClioNode.hpp | 14 +++++- src/cluster/WriterDecider.cpp | 6 +++ src/etl/ETLService.cpp | 3 ++ src/etl/SystemState.hpp | 15 ++++++- src/etl/WriterState.hpp | 26 +++++++++++ src/etl/impl/Loading.cpp | 3 ++ tests/unit/cluster/BackendTests.cpp | 2 + tests/unit/cluster/ClioNodeTests.cpp | 23 +++++++++- tests/unit/cluster/WriterDeciderTests.cpp | 53 +++++++++++++++++++++-- tests/unit/etl/ETLServiceTests.cpp | 11 +++++ tests/unit/etl/LoadingTests.cpp | 6 +++ tests/unit/etl/WriterStateTests.cpp | 24 ++++++++++ 13 files changed, 183 insertions(+), 7 deletions(-) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index 42d2f1adbd..a23464bf25 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -49,6 +49,10 @@ struct Fields { ClioNode ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState) { + // Determine the database role based on writer state priority: + // 1. ReadOnly takes precedence (configured mode) + // 2. Fallback mode indicates cluster-wide fallback mechanism is active + // 3. Otherwise, Writer or NotWriter based on current writing state auto const dbRole = [&writerState]() { if (writerState.isReadOnly()) { return ClioNode::DbRole::ReadOnly; diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index ed583b28e5..48bef5071b 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -39,7 +39,19 @@ struct ClioNode { */ static constexpr char const* kTIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"; - /** @brief Database role */ + /** + * @brief Database role of a node in the cluster. + * + * Roles are used to coordinate which node writes to the database: + * - ReadOnly: Node is configured to never write (strict read-only mode) + * - NotWriter: Node can write but is currently not the designated writer + * - Writer: Node is actively writing to the database + * - Fallback: Node is using the fallback writer decision mechanism + * + * When any node in the cluster is in Fallback mode, the entire cluster switches + * from the cluster communication mechanism to the slower but more reliable + * database-based conflict detection mechanism. + */ enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, Fallback = 3, MAX = 3 }; using UUID = std::shared_ptr; diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index 0d67db1238..9eab301507 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -51,13 +51,19 @@ WriterDecider::onNewState(ClioNode::cUUID selfId, std::shared_ptrclone(), selfId = std::move(selfId), clusterData = clusterData->value()](auto&&) mutable { + // Find this node's data in the cluster state auto const selfData = std::ranges::find_if(clusterData, [&selfId](ClioNode const& node) { return node.uuid == selfId; }); ASSERT(selfData != clusterData.end(), "Self data should always be in the cluster data"); + + // ReadOnly nodes never participate in writer decisions + // Fallback nodes have already switched to fallback mechanism if (selfData->dbRole == ClioNode::DbRole::ReadOnly or selfData->dbRole == ClioNode::DbRole::Fallback) { return; } + // If any node in the cluster is in Fallback mode, the entire cluster must switch + // to the fallback writer decision mechanism for consistency if (std::ranges::any_of(clusterData, [](ClioNode const& node) { return node.dbRole == ClioNode::DbRole::Fallback; })) { diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index dc35358216..8dc5d71d2d 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -395,6 +395,8 @@ ETLService::startMonitor(uint32_t seq) monitorDbStalledSubscription_ = monitor_->subscribeToDbStalled([this]() { LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor"; + // Database stall detected - no writer has been active for 10 seconds + // This triggers the fallback mechanism and attempts to become the writer if (not state_->isStrictReadonly and not state_->isWriting) state_->writeCommandSignal(SystemState::WriteCommand::StartWriting); state_->isWriterDecidingFallback = true; @@ -435,3 +437,4 @@ ETLService::giveUpWriter() } } // namespace etl + diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 21bcf8b1c3..c486337bd4 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -113,11 +113,24 @@ struct SystemState { "Whether clio detected a corruption that needs manual attention" ); + /** + * @brief Whether the cluster is using the fallback writer decision mechanism. + * + * The fallback mechanism is triggered when: + * - The database stalls for 10 seconds (detected by Monitor), indicating no active writer + * - A write conflict is detected, indicating multiple nodes attempting to write simultaneously + * + * When fallback mode is active, the cluster stops using the cluster communication mechanism + * (TTL-based role announcements) and relies on the slower but more reliable database-based + * conflict detection. This flag propagates across the cluster - if any node enters fallback + * mode, all nodes in the cluster will switch to fallback mode. + */ util::prometheus::Bool isWriterDecidingFallback = PrometheusService::boolMetric( "etl_writing_deciding_fallback", util::prometheus::Labels{}, - "Whether clio detected a corruption that needs manual attention" + "Whether the cluster is using the fallback writer decision mechanism" ); }; } // namespace etl + diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index e7b1bfe3d1..8e3e7116b4 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -68,9 +68,22 @@ class WriterStateInterface { virtual void giveUpWriting() = 0; + /** + * @brief Check if the cluster is using the fallback writer decision mechanism. + * + * @return true if the cluster has switched to fallback mode, false otherwise + */ [[nodiscard]] virtual bool isFallback() const = 0; + /** + * @brief Switch the cluster to the fallback writer decision mechanism. + * + * This method is called when the cluster needs to transition from the cluster + * communication mechanism to the slower but more reliable fallback mechanism. + * Once set, this flag propagates to all nodes in the cluster through the + * ClioNode DbRole::Fallback state. + */ virtual void setWriterDecidingFallback() = 0; @@ -124,9 +137,20 @@ class WriterState : public WriterStateInterface { void giveUpWriting() override; + /** + * @brief Switch the cluster to the fallback writer decision mechanism. + * + * Sets the isWriterDecidingFallback flag in the system state, which will be + * propagated to other nodes in the cluster through the ClioNode DbRole::Fallback state. + */ void setWriterDecidingFallback() override; + /** + * @brief Check if the cluster is using the fallback writer decision mechanism. + * + * @return true if the cluster has switched to fallback mode, false otherwise + */ bool isFallback() const override; @@ -135,3 +159,5 @@ class WriterState : public WriterStateInterface { }; } // namespace etl + + diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index aed400e011..9bb1d6f690 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -75,6 +75,8 @@ Loader::load(model::LedgerData const& data) << "; took " << duration << "ms"; if (not success) { + // Write conflict detected - another node wrote to the database + // This triggers the fallback mechanism and stops this node from writing state_->writeCommandSignal(SystemState::WriteCommand::StopWriting); state_->isWriterDecidingFallback = true; LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict"; @@ -155,3 +157,4 @@ Loader::loadInitialLedger(model::LedgerData const& data) } } // namespace etl::impl + diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index 8cc38cb45c..d0e4f7c6b2 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -175,6 +175,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) ); EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) @@ -253,6 +254,7 @@ TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndD .Times(testing::AtLeast(1)) .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(*backend_, writeNodeMessage) .Times(testing::AtLeast(1)) diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index 8dbd219f28..beece6a666 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -111,7 +111,8 @@ INSTANTIATE_TEST_SUITE_P( testing::Values( ClioNodeDbRoleTestBundle{.testName = "ReadOnly", .role = ClioNode::DbRole::ReadOnly}, ClioNodeDbRoleTestBundle{.testName = "NotWriter", .role = ClioNode::DbRole::NotWriter}, - ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer} + ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer}, + ClioNodeDbRoleTestBundle{.testName = "Fallback", .role = ClioNode::DbRole::Fallback} ), tests::util::kNAME_GENERATOR ); @@ -153,6 +154,7 @@ TEST_F(ClioNodeDbRoleTest, DeserializationMissingDbRole) struct ClioNodeFromTestBundle { std::string testName; bool readOnly; + bool fallback; bool writing; ClioNode::DbRole expectedRole; }; @@ -170,18 +172,28 @@ INSTANTIATE_TEST_SUITE_P( ClioNodeFromTestBundle{ .testName = "ReadOnly", .readOnly = true, + .fallback = false, .writing = false, .expectedRole = ClioNode::DbRole::ReadOnly }, + ClioNodeFromTestBundle{ + .testName = "Fallback", + .readOnly = false, + .fallback = true, + .writing = false, + .expectedRole = ClioNode::DbRole::Fallback + }, ClioNodeFromTestBundle{ .testName = "NotWriterNotReadOnly", .readOnly = false, + .fallback = false, .writing = false, .expectedRole = ClioNode::DbRole::NotWriter }, ClioNodeFromTestBundle{ .testName = "Writer", .readOnly = false, + .fallback = false, .writing = true, .expectedRole = ClioNode::DbRole::Writer } @@ -195,7 +207,10 @@ TEST_P(ClioNodeFromTest, FromWriterState) EXPECT_CALL(writerState, isReadOnly()).WillOnce(testing::Return(param.readOnly)); if (not param.readOnly) { - EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + EXPECT_CALL(writerState, isFallback()).WillOnce(testing::Return(param.fallback)); + if (not param.fallback) { + EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + } } auto const beforeTime = std::chrono::system_clock::now(); @@ -207,3 +222,7 @@ TEST_P(ClioNodeFromTest, FromWriterState) EXPECT_GE(node.updateTime, beforeTime); EXPECT_LE(node.updateTime, afterTime); } + + + + diff --git a/tests/unit/cluster/WriterDeciderTests.cpp b/tests/unit/cluster/WriterDeciderTests.cpp index ecfd94459c..3cee39e24d 100644 --- a/tests/unit/cluster/WriterDeciderTests.cpp +++ b/tests/unit/cluster/WriterDeciderTests.cpp @@ -37,7 +37,7 @@ using namespace cluster; -enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction }; +enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction, SetFallback }; struct WriterDeciderTestParams { std::string testName; @@ -97,6 +97,10 @@ TEST_P(WriterDeciderTest, WriterSelection) EXPECT_CALL(*clonedState, giveUpWriting()); EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); break; + case ExpectedAction::SetFallback: + EXPECT_CALL(*clonedState, setWriterDecidingFallback()); + EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState)))); + break; case ExpectedAction::NoAction: if (not params.useEmptyClusterData) { // For all-ReadOnly case, we still clone but don't call any action @@ -107,19 +111,25 @@ TEST_P(WriterDeciderTest, WriterSelection) } std::shared_ptr clusterData; + ClioNode::cUUID selfIdPtr; if (params.useEmptyClusterData) { clusterData = std::make_shared(std::unexpected(std::string("Communication failed"))); + selfIdPtr = std::make_shared(selfUuid); } else { std::vector nodes; nodes.reserve(params.nodes.size()); for (auto const& [uuidValue, role] : params.nodes) { - nodes.push_back(makeNode(makeUuid(uuidValue), role)); + auto node = makeNode(makeUuid(uuidValue), role); + if (uuidValue == params.selfUuidValue) { + selfIdPtr = node.uuid; // Use the same shared_ptr as in the node + } + nodes.push_back(std::move(node)); } clusterData = std::make_shared(std::move(nodes)); } - decider.onNewState(std::make_shared(selfUuid), clusterData); + decider.onNewState(selfIdPtr, clusterData); ctx.join(); } @@ -220,6 +230,43 @@ INSTANTIATE_TEST_SUITE_P( {0x03, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::ReadOnly}}, .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "SelfIsFallbackNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Fallback}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsFallbackSetsFallbackMode", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Fallback}}, + .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "SelfIsReadOnlyOthersAreFallbackNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::Fallback}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "MultipleFallbackNodesSelfNotFallbackSetsFallback", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::Fallback}, + {0x02, ClioNode::DbRole::Fallback}, + {0x03, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "MixedRolesWithOneFallbackSetsFallback", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::Writer}, + {0x02, ClioNode::DbRole::NotWriter}, + {0x03, ClioNode::DbRole::Fallback}, + {0x04, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::SetFallback } ), [](testing::TestParamInfo const& info) { return info.param.testName; } diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 7468af1bb1..f7780a8791 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -459,8 +459,10 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) ASSERT_TRUE(capturedDbStalledCallback); EXPECT_FALSE(systemState_->isWriting); // will attempt to become writer after new sequence appears but not yet + EXPECT_FALSE(systemState_->isWriterDecidingFallback); capturedDbStalledCallback(); EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated } TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) @@ -576,9 +578,12 @@ TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenStrictReadonly) systemState_->isWriting = false; // No signal should be emitted because node is in strict readonly mode + // But fallback flag should still be set ASSERT_TRUE(capturedDbStalledCallback); + EXPECT_FALSE(systemState_->isWriterDecidingFallback); capturedDbStalledCallback(); + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated even in readonly } TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) @@ -607,9 +612,12 @@ TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) systemState_->isWriting = true; // already writing // No signal should be emitted because node is already writing + // But fallback flag should still be set ASSERT_TRUE(capturedDbStalledCallback); + EXPECT_FALSE(systemState_->isWriterDecidingFallback); capturedDbStalledCallback(); + EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated } TEST_F(ETLServiceTests, CacheUpdatesDependOnActualCacheState_WriterMode) @@ -849,3 +857,6 @@ TEST_F(ETLServiceTests, WriteCommandsAreSerializedOnStrand) // Final state should be writing (last signal was StartWriting) EXPECT_TRUE(systemState_->isWriting); } + + + diff --git a/tests/unit/etl/LoadingTests.cpp b/tests/unit/etl/LoadingTests.cpp index 6631fde732..e65f77060a 100644 --- a/tests/unit/etl/LoadingTests.cpp +++ b/tests/unit/etl/LoadingTests.cpp @@ -201,9 +201,12 @@ TEST_F(LoadingTests, LoadWriteConflictEmitsStopWritingSignal) EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(false)); // simulate write conflict EXPECT_CALL(mockSignalCallback, Call(etl::SystemState::WriteCommand::StopWriting)); + EXPECT_FALSE(state_->isWriterDecidingFallback); + auto result = loader_.load(data); EXPECT_FALSE(result.has_value()); EXPECT_EQ(result.error(), etl::LoaderError::WriteConflict); + EXPECT_TRUE(state_->isWriterDecidingFallback); } TEST_F(LoadingTests, LoadSuccessDoesNotEmitSignal) @@ -218,8 +221,11 @@ TEST_F(LoadingTests, LoadSuccessDoesNotEmitSignal) EXPECT_CALL(*backend_, doFinishWrites()).WillOnce(testing::Return(true)); // success // No signal should be emitted on success + EXPECT_FALSE(state_->isWriterDecidingFallback); + auto result = loader_.load(data); EXPECT_TRUE(result.has_value()); + EXPECT_FALSE(state_->isWriterDecidingFallback); } TEST_F(LoadingTests, LoadWhenNotWritingDoesNotCheckConflict) diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp index 686a1a75ec..8a0fd5be9a 100644 --- a/tests/unit/etl/WriterStateTests.cpp +++ b/tests/unit/etl/WriterStateTests.cpp @@ -84,3 +84,27 @@ TEST_F(WriterStateTest, GiveUpWritingDoesNothingWhenNotWriting) writerState.giveUpWriting(); } + +TEST_F(WriterStateTest, IsFallbackReturnsFalseByDefault) +{ + EXPECT_FALSE(writerState.isFallback()); +} + +TEST_F(WriterStateTest, SetWriterDecidingFallbackSetsFlag) +{ + EXPECT_FALSE(systemState->isWriterDecidingFallback); + + writerState.setWriterDecidingFallback(); + + EXPECT_TRUE(systemState->isWriterDecidingFallback); +} + +TEST_F(WriterStateTest, IsFallbackReturnsSystemStateValue) +{ + systemState->isWriterDecidingFallback = false; + EXPECT_FALSE(writerState.isFallback()); + + systemState->isWriterDecidingFallback = true; + EXPECT_TRUE(writerState.isFallback()); +} + From 0737efc49efced2cf46aad5984def14482a418ab Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 13 Jan 2026 15:25:09 +0000 Subject: [PATCH 28/41] Run pre-commit --- src/app/Stopper.hpp | 1 + src/cluster/Backend.hpp | 6 ++++++ src/cluster/Concepts.hpp | 6 ++++++ src/etl/ETLService.cpp | 1 - src/etl/SystemState.hpp | 1 - src/etl/WriterState.hpp | 16 ++++++++++++++++ src/etl/impl/Loading.cpp | 1 - tests/unit/cluster/ClioNodeTests.cpp | 4 ---- tests/unit/etl/ETLServiceTests.cpp | 5 +---- tests/unit/etl/WriterStateTests.cpp | 1 - 10 files changed, 30 insertions(+), 12 deletions(-) diff --git a/src/app/Stopper.hpp b/src/app/Stopper.hpp index 5b00d975c2..b21ffc487a 100644 --- a/src/app/Stopper.hpp +++ b/src/app/Stopper.hpp @@ -83,6 +83,7 @@ class Stopper { * @param subscriptions The subscription manager to stop. * @param backend The backend to stop. * @param cacheSaver The ledger cache saver + * @param clusterCommunicationService The cluster communication service to stop. * @param ioc The io_context to stop. * @return The callback to be called on application stop. */ diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index 31c90b8504..8adb766608 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -128,6 +128,11 @@ class Backend { return onNewState_.connect(s); } + /** + * @brief Get the UUID of this node in the cluster. + * + * @return The UUID of this node. + */ ClioNode::cUUID selfId() const; @@ -140,3 +145,4 @@ class Backend { }; } // namespace cluster + diff --git a/src/cluster/Concepts.hpp b/src/cluster/Concepts.hpp index 5b161dc866..340cb0c62b 100644 --- a/src/cluster/Concepts.hpp +++ b/src/cluster/Concepts.hpp @@ -23,6 +23,12 @@ namespace cluster { +/** + * @brief Tag type for cluster communication service implementations. + * + * This tag is used to identify types that implement cluster communication functionality. + * Types should inherit from this tag to be recognized as cluster communication services. + */ struct ClusterCommunicationServiceTag { virtual ~ClusterCommunicationServiceTag() = default; }; diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 8dc5d71d2d..67579e63b7 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -437,4 +437,3 @@ ETLService::giveUpWriter() } } // namespace etl - diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index c486337bd4..b7dc0a815a 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -133,4 +133,3 @@ struct SystemState { }; } // namespace etl - diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index 8e3e7116b4..36d77811fc 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -87,6 +87,15 @@ class WriterStateInterface { virtual void setWriterDecidingFallback() = 0; + /** + * @brief Create a clone of this writer state. + * + * Creates a new instance of the writer state with the same underlying system state. + * This is used when spawning operations that need their own writer state instance + * while sharing the same system state. + * + * @return A unique pointer to the cloned writer state. + */ [[nodiscard]] virtual std::unique_ptr clone() const = 0; }; @@ -154,6 +163,13 @@ class WriterState : public WriterStateInterface { bool isFallback() const override; + /** + * @brief Create a clone of this writer state. + * + * Creates a new WriterState instance sharing the same system state. + * + * @return A unique pointer to the cloned writer state. + */ std::unique_ptr clone() const override; }; diff --git a/src/etl/impl/Loading.cpp b/src/etl/impl/Loading.cpp index 9bb1d6f690..4b1e244582 100644 --- a/src/etl/impl/Loading.cpp +++ b/src/etl/impl/Loading.cpp @@ -157,4 +157,3 @@ Loader::loadInitialLedger(model::LedgerData const& data) } } // namespace etl::impl - diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index beece6a666..1e3b5adb35 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -222,7 +222,3 @@ TEST_P(ClioNodeFromTest, FromWriterState) EXPECT_GE(node.updateTime, beforeTime); EXPECT_LE(node.updateTime, afterTime); } - - - - diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index f7780a8791..828a9fbf31 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -461,7 +461,7 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) EXPECT_FALSE(systemState_->isWriting); // will attempt to become writer after new sequence appears but not yet EXPECT_FALSE(systemState_->isWriterDecidingFallback); capturedDbStalledCallback(); - EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer + EXPECT_TRUE(systemState_->isWriting); // should attempt to become writer EXPECT_TRUE(systemState_->isWriterDecidingFallback); // fallback mode activated } @@ -857,6 +857,3 @@ TEST_F(ETLServiceTests, WriteCommandsAreSerializedOnStrand) // Final state should be writing (last signal was StartWriting) EXPECT_TRUE(systemState_->isWriting); } - - - diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp index 8a0fd5be9a..9dc6e71c3d 100644 --- a/tests/unit/etl/WriterStateTests.cpp +++ b/tests/unit/etl/WriterStateTests.cpp @@ -107,4 +107,3 @@ TEST_F(WriterStateTest, IsFallbackReturnsSystemStateValue) systemState->isWriterDecidingFallback = true; EXPECT_TRUE(writerState.isFallback()); } - From bae896f854aaf930dbf4945ddb169fd3ddd7fbdd Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 11:25:20 +0000 Subject: [PATCH 29/41] Run pre-commit --- src/cluster/Backend.hpp | 1 - src/etl/WriterState.hpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index 8adb766608..558ba6cbb1 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -145,4 +145,3 @@ class Backend { }; } // namespace cluster - diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index 36d77811fc..7373be2f9c 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -175,5 +175,3 @@ class WriterState : public WriterStateInterface { }; } // namespace etl - - From d4236b1a9005c2ab1f26ed50cb3c2a9bf55ce485 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 11:25:20 +0000 Subject: [PATCH 30/41] Fix flaky test --- tests/unit/etl/MonitorTests.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/unit/etl/MonitorTests.cpp b/tests/unit/etl/MonitorTests.cpp index 7cd663cd4e..a5fe71818a 100644 --- a/tests/unit/etl/MonitorTests.cpp +++ b/tests/unit/etl/MonitorTests.cpp @@ -164,7 +164,10 @@ TEST_F(MonitorTests, DbStalledChannelTriggeredWhenTimeoutExceeded) EXPECT_CALL(*ledgers_, subscribe(testing::_)); EXPECT_CALL(*backend_, hardFetchLedgerRange(testing::_)).WillRepeatedly(testing::Return(std::nullopt)); - EXPECT_CALL(dbStalledMock_, Call()).WillOnce([&]() { unblock.release(); }); + EXPECT_CALL(dbStalledMock_, Call()).WillOnce([&]() { + monitor_.stop(); // Prevent monitor to have another loop between semaphore and destructor + unblock.release(); + }); auto subscription = monitor_.subscribeToDbStalled(dbStalledMock_.AsStdFunction()); monitor_.run(std::chrono::nanoseconds{100}); From 39d4b9479a3c01e7aa8e4f95df22407edcf742aa Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 14:57:52 +0000 Subject: [PATCH 31/41] Run pre-commit --- tests/unit/etl/MonitorTests.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/etl/MonitorTests.cpp b/tests/unit/etl/MonitorTests.cpp index a5fe71818a..ee185679e8 100644 --- a/tests/unit/etl/MonitorTests.cpp +++ b/tests/unit/etl/MonitorTests.cpp @@ -165,7 +165,7 @@ TEST_F(MonitorTests, DbStalledChannelTriggeredWhenTimeoutExceeded) EXPECT_CALL(*ledgers_, subscribe(testing::_)); EXPECT_CALL(*backend_, hardFetchLedgerRange(testing::_)).WillRepeatedly(testing::Return(std::nullopt)); EXPECT_CALL(dbStalledMock_, Call()).WillOnce([&]() { - monitor_.stop(); // Prevent monitor to have another loop between semaphore and destructor + monitor_.stop(); // Prevent monitor to have another loop between semaphore and destructor unblock.release(); }); From 66615bcb1e1db354b1365d90c322e193017cd0a5 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 14 Jan 2026 14:58:36 +0000 Subject: [PATCH 32/41] Fix review issues --- src/etl/ETLService.cpp | 12 +++++------- src/util/Channel.hpp | 28 ++++++++++++++++------------ tests/unit/etl/ETLServiceTests.cpp | 3 +-- 3 files changed, 22 insertions(+), 21 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index 67579e63b7..1730417b6f 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -378,18 +378,16 @@ ETLService::startMonitor(uint32_t seq) auto const backendRange = backend_->fetchLedgerRange(); auto const backendNeedsUpdate = backendRange.has_value() and backendRange->maxSequence < seq; - if (cacheNeedsUpdate or backendNeedsUpdate) { + if (cacheNeedsUpdate) { auto const diff = data::synchronousAndRetryOnTimeout([this, seq](auto yield) { return backend_->fetchLedgerDiff(seq, yield); }); - - if (cacheNeedsUpdate) - cacheUpdater_->update(seq, diff); - - if (backendNeedsUpdate) - backend_->updateRange(seq); + cacheUpdater_->update(seq, diff); } + if (backendNeedsUpdate) + backend_->updateRange(seq); + publisher_->publish(seq, {}); }); diff --git a/src/util/Channel.hpp b/src/util/Channel.hpp index aed4e96a63..d85128ae39 100644 --- a/src/util/Channel.hpp +++ b/src/util/Channel.hpp @@ -151,13 +151,15 @@ class Channel { * @brief Constructs a Sender from a shared control block. * @param shared The shared control block managing the channel state */ - Sender(std::shared_ptr shared) : shared_(shared) + Sender(std::shared_ptr shared) + : shared_(shared), guard_([shared = std::move(shared)]() { + if constexpr (kIS_MULTI_PRODUCER) { + return std::make_shared(std::move(shared)); + } else { + return Guard{std::move(shared)}; + } + }()) { - if constexpr (kIS_MULTI_PRODUCER) { - guard_ = std::make_shared(shared); - } else { - guard_ = Guard{std::move(shared)}; - } } public: @@ -270,13 +272,15 @@ class Channel { * @brief Constructs a Receiver from a shared control block. * @param shared The shared control block managing the channel state */ - Receiver(std::shared_ptr shared) : shared_(shared) + Receiver(std::shared_ptr shared) + : shared_(shared), guard_([shared = std::move(shared)]() { + if constexpr (kIS_MULTI_CONSUMER) { + return std::make_shared(std::move(shared)); + } else { + return Guard{std::move(shared)}; + } + }()) { - if constexpr (kIS_MULTI_CONSUMER) { - guard_ = std::make_shared(shared); - } else { - guard_ = Guard{std::move(shared)}; - } } public: diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 828a9fbf31..1e84df1e64 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -769,8 +769,7 @@ TEST_F(ETLServiceTests, StopWaitsForWriteCommandHandlersToComplete) // Stop should wait for the handler to complete and disconnect the subscription service_.stop(); - // Verify stop() returned, meaning all handlers completed - SUCCEED(); + // The test will hang on stop() or in service_ destructor if there is a problem. } TEST_F(ETLServiceTests, WriteConflictIsHandledImmediately_NotDelayed) From 207a281a0cb917ea0aa7aae83ee52856d718ffbc Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Wed, 14 Jan 2026 13:56:25 +0000 Subject: [PATCH 33/41] Add Loading state --- src/cluster/ClioNode.cpp | 7 +++---- src/cluster/ClioNode.hpp | 2 +- src/cluster/WriterDecider.cpp | 2 +- src/etl/ETLService.cpp | 9 +++++++++ src/etl/SystemState.hpp | 11 +++++++++++ src/etl/WriterState.cpp | 6 ++++++ src/etl/WriterState.hpp | 6 ++++++ tests/common/util/MockWriterState.hpp | 1 + 8 files changed, 38 insertions(+), 6 deletions(-) diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index a23464bf25..7aa13e76e4 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -49,10 +49,6 @@ struct Fields { ClioNode ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState) { - // Determine the database role based on writer state priority: - // 1. ReadOnly takes precedence (configured mode) - // 2. Fallback mode indicates cluster-wide fallback mechanism is active - // 3. Otherwise, Writer or NotWriter based on current writing state auto const dbRole = [&writerState]() { if (writerState.isReadOnly()) { return ClioNode::DbRole::ReadOnly; @@ -60,6 +56,9 @@ ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState if (writerState.isFallback()) { return ClioNode::DbRole::Fallback; } + if (writerState.isLoadingCache()) { + return ClioNode::DbRole::LoadingCache; + } return writerState.isWriting() ? ClioNode::DbRole::Writer : ClioNode::DbRole::NotWriter; }(); diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index 48bef5071b..2da8445d57 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -52,7 +52,7 @@ struct ClioNode { * from the cluster communication mechanism to the slower but more reliable * database-based conflict detection mechanism. */ - enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, Fallback = 3, MAX = 3 }; + enum class DbRole { ReadOnly = 0, LoadingCache = 1, NotWriter = 2, Writer = 3, Fallback = 4, MAX = 4 }; using UUID = std::shared_ptr; using cUUID = std::shared_ptr; diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index 9eab301507..fa3cdfdc1c 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -76,7 +76,7 @@ WriterDecider::onNewState(ClioNode::cUUID selfId, std::shared_ptrhardFetchLedgerRangeNoThrow(); ASSERT(rng.has_value(), "Ledger range can't be null"); + if (backend_->cache().latestLedgerSequence() != rng->maxSequence) { + LOG(log_.info()) << "Wanted to take over the ETL writer seat but LedgerCache is outdated"; + // Give ETL time to update LedgerCache. This method will be called because ClusterCommunication will likely to + // continue sending StartWriting signal every 1 second + return; + } + state_->isWriting = true; // switch to writer LOG(log_.info()) << "Taking over the ETL writer seat"; startLoading(rng->maxSequence + 1); diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index b7dc0a815a..6386e0efe0 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -36,6 +36,11 @@ namespace etl { * @brief Represents the state of the ETL subsystem. */ struct SystemState { + SystemState() + { + isLoadingCache = true; + } + /** * @brief Factory method to create a SystemState instance. * @@ -69,6 +74,12 @@ struct SystemState { "Whether the process is writing to the database" ); + util::prometheus::Bool isLoadingCache = PrometheusService::boolMetric( + "etl_loading_cache", + util::prometheus::Labels{}, + "Whether etl is loading cache after clio startup" + ); + /** * @brief Commands for controlling the ETL writer state. * diff --git a/src/etl/WriterState.cpp b/src/etl/WriterState.cpp index abbfabfd56..8947febd48 100644 --- a/src/etl/WriterState.cpp +++ b/src/etl/WriterState.cpp @@ -72,6 +72,12 @@ WriterState::isFallback() const return systemState_->isWriterDecidingFallback; } +bool +WriterState::isLoadingCache() const +{ + return systemState_->isLoadingCache; +} + std::unique_ptr WriterState::clone() const { diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index 7373be2f9c..3f375ca865 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -87,6 +87,9 @@ class WriterStateInterface { virtual void setWriterDecidingFallback() = 0; + [[nodiscard]] virtual bool + isLoadingCache() const = 0; + /** * @brief Create a clone of this writer state. * @@ -163,6 +166,9 @@ class WriterState : public WriterStateInterface { bool isFallback() const override; + bool + isLoadingCache() const override; + /** * @brief Create a clone of this writer state. * diff --git a/tests/common/util/MockWriterState.hpp b/tests/common/util/MockWriterState.hpp index 97a57f5dcb..a1821d5994 100644 --- a/tests/common/util/MockWriterState.hpp +++ b/tests/common/util/MockWriterState.hpp @@ -32,6 +32,7 @@ struct MockWriterStateBase : public etl::WriterStateInterface { MOCK_METHOD(void, giveUpWriting, (), (override)); MOCK_METHOD(void, setWriterDecidingFallback, (), (override)); MOCK_METHOD(bool, isFallback, (), (const, override)); + MOCK_METHOD(bool, isLoadingCache, (), (const, override)); MOCK_METHOD(std::unique_ptr, clone, (), (const, override)); }; From 5e562cf5fe8cb91e6aa08793cd3db942e07fa343 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Thu, 15 Jan 2026 18:00:25 +0000 Subject: [PATCH 34/41] Fix ETLService tests --- src/etl/ETLService.cpp | 17 ++---- tests/unit/etl/ETLServiceTests.cpp | 95 +++++++++++++++++++++++++----- 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/etl/ETLService.cpp b/src/etl/ETLService.cpp index de6919344f..433721c281 100644 --- a/src/etl/ETLService.cpp +++ b/src/etl/ETLService.cpp @@ -212,14 +212,8 @@ ETLService::run() return; } - auto nextSequence = rng->maxSequence + 1; - if (backend_->cache().latestLedgerSequence() != 0) { - nextSequence = backend_->cache().latestLedgerSequence(); - } - + auto const nextSequence = syncCacheWithDb(); LOG(log_.debug()) << "Database is populated. Starting monitor loop. sequence = " << nextSequence; - nextSequence = syncCacheWithDb(); - startMonitor(nextSequence); @@ -358,16 +352,17 @@ uint32_t ETLService::syncCacheWithDb() { auto rng = backend_->hardFetchLedgerRangeNoThrow(); - while (rng->maxSequence > backend_->cache().latestLedgerSequence()) { - LOG(log_.info()) << "Syncing cache with DB. DB latest seq: " << rng->maxSequence << ". Cache latest seq: " - << backend_->cache().latestLedgerSequence(); + + while (not backend_->cache().isDisabled() and rng->maxSequence > backend_->cache().latestLedgerSequence()) { + LOG(log_.info()) << "Syncing cache with DB. DB latest seq: " << rng->maxSequence + << ". Cache latest seq: " << backend_->cache().latestLedgerSequence(); for (auto seq = backend_->cache().latestLedgerSequence(); seq <= rng->maxSequence; ++seq) { LOG(log_.info()) << "ETLService (via syncCacheWithDb) got new seq from db: " << seq; updateCache(seq); } rng = backend_->hardFetchLedgerRangeNoThrow(); } - return rng->maxSequence; + return rng->maxSequence + 1; } void diff --git a/tests/unit/etl/ETLServiceTests.cpp b/tests/unit/etl/ETLServiceTests.cpp index 1e84df1e64..91625600f4 100644 --- a/tests/unit/etl/ETLServiceTests.cpp +++ b/tests/unit/etl/ETLServiceTests.cpp @@ -304,6 +304,7 @@ TEST_F(ETLServiceTests, RunWithEmptyDatabase) auto mockTaskManager = std::make_unique>(); auto& mockTaskManagerRef = *mockTaskManager; auto ledgerData = createTestData(kSEQ); + EXPECT_TRUE(systemState_->isLoadingCache); testing::Sequence const s; EXPECT_CALL(*backend_, hardFetchLedgerRange).InSequence(s).WillOnce(testing::Return(std::nullopt)); @@ -312,25 +313,61 @@ TEST_F(ETLServiceTests, RunWithEmptyDatabase) EXPECT_CALL(*balancer_, loadInitialLedger(kSEQ, testing::_, testing::_)) .WillOnce(testing::Return(std::vector{})); EXPECT_CALL(*loader_, loadInitialLedger).WillOnce(testing::Return(ripple::LedgerHeader{})); - EXPECT_CALL(*backend_, hardFetchLedgerRange) - .InSequence(s) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + // In syncCacheWithDb() + EXPECT_CALL(*backend_, hardFetchLedgerRange).Times(2).InSequence(s).WillRepeatedly([this]() { + backend_->cache().update({}, kSEQ, false); + return data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ}; + }); EXPECT_CALL(mockTaskManagerRef, run); - EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)) - .WillOnce(testing::Return(std::unique_ptr(mockTaskManager.release()))); - EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, testing::_, testing::_)) - .WillOnce([](auto, auto, auto, auto, auto) { return std::make_unique>(); }); + EXPECT_CALL(*taskManagerProvider_, make(testing::_, testing::_, kSEQ + 1, testing::_)).WillOnce([&](auto&&...) { + EXPECT_FALSE(systemState_->isLoadingCache); + return std::unique_ptr(mockTaskManager.release()); + }); + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); service_.run(); } TEST_F(ETLServiceTests, RunWithPopulatedDatabase) { + EXPECT_TRUE(systemState_->isLoadingCache); + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); - EXPECT_CALL(*monitorProvider_, make).WillOnce([](auto, auto, auto, auto, auto) { - return std::make_unique>(); - }); + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); + EXPECT_CALL(*ledgers_, getMostRecent()).WillRepeatedly(testing::Return(kSEQ)); + EXPECT_CALL(*cacheLoader_, load(kSEQ)); + + service_.run(); +} + +TEST_F(ETLServiceTests, SyncCacheWithDbBeforeStartingMonitor) +{ + EXPECT_TRUE(systemState_->isLoadingCache); + backend_->cache().update({}, kSEQ - 2, false); + EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ - 1, testing::_)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ - 1, std::vector())) + .WillOnce([this](auto const seq, auto&&...) { backend_->cache().update({}, seq, false); }); + EXPECT_CALL(*backend_, fetchLedgerDiff(kSEQ, testing::_)); + EXPECT_CALL(*cacheUpdater_, update(kSEQ, std::vector())) + .WillOnce([this](auto const seq, auto&&...) { backend_->cache().update({}, seq, false); }); + + EXPECT_CALL(*monitorProvider_, make(testing::_, testing::_, testing::_, kSEQ + 1, testing::_)) + .WillOnce([this](auto, auto, auto, auto, auto) { + EXPECT_TRUE(systemState_->isLoadingCache); + return std::make_unique>(); + }); EXPECT_CALL(*ledgers_, getMostRecent()).WillRepeatedly(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -368,8 +405,11 @@ TEST_F(ETLServiceTests, HandlesWriteConflictInMonitorSubscription) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -401,8 +441,11 @@ TEST_F(ETLServiceTests, NormalFlowInMonitorSubscription) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -439,6 +482,8 @@ TEST_F(ETLServiceTests, AttemptTakeoverWriter) }); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -482,8 +527,11 @@ TEST_F(ETLServiceTests, GiveUpWriterAfterWriteConflict) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) - .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); + .Times(2) + .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); EXPECT_CALL(*cacheLoader_, load(kSEQ)); @@ -568,6 +616,8 @@ TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenStrictReadonly) }); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -602,6 +652,8 @@ TEST_F(ETLServiceTests, DbStalledDoesNotTriggerSignalWhenAlreadyWriting) }); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -636,6 +688,8 @@ TEST_F(ETLServiceTests, CacheUpdatesDependOnActualCacheState_WriterMode) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -666,6 +720,8 @@ TEST_F(ETLServiceTests, OnlyCacheUpdatesWhenBackendIsCurrent) auto mockMonitor = std::make_unique>(); auto& mockMonitorRef = *mockMonitor; std::function capturedCallback; + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { return std::move(mockMonitor); @@ -679,6 +735,7 @@ TEST_F(ETLServiceTests, OnlyCacheUpdatesWhenBackendIsCurrent) // Set backend range to be at kSEQ + 1 (already current) EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -706,6 +763,8 @@ TEST_F(ETLServiceTests, NoUpdatesWhenBothCacheAndBackendAreCurrent) auto mockMonitor = std::make_unique>(); auto& mockMonitorRef = *mockMonitor; std::function capturedCallback; + // Set cache to be in sync with DB initially to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { return std::move(mockMonitor); @@ -719,6 +778,7 @@ TEST_F(ETLServiceTests, NoUpdatesWhenBothCacheAndBackendAreCurrent) // Set backend range to be at kSEQ + 1 (already current) EXPECT_CALL(*backend_, hardFetchLedgerRange) + .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) .WillOnce(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ + 1})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -743,6 +803,8 @@ TEST_F(ETLServiceTests, NoUpdatesWhenBothCacheAndBackendAreCurrent) TEST_F(ETLServiceTests, StopWaitsForWriteCommandHandlersToComplete) { auto mockMonitor = std::make_unique>(); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*monitorProvider_, make).WillOnce([&mockMonitor](auto, auto, auto, auto, auto) { return std::move(mockMonitor); @@ -769,7 +831,8 @@ TEST_F(ETLServiceTests, StopWaitsForWriteCommandHandlersToComplete) // Stop should wait for the handler to complete and disconnect the subscription service_.stop(); - // The test will hang on stop() or in service_ destructor if there is a problem. + // Verify stop() returned, meaning all handlers completed + SUCCEED(); } TEST_F(ETLServiceTests, WriteConflictIsHandledImmediately_NotDelayed) @@ -791,6 +854,8 @@ TEST_F(ETLServiceTests, WriteConflictIsHandledImmediately_NotDelayed) EXPECT_CALL(mockMonitorRef, subscribeToDbStalled); EXPECT_CALL(mockMonitorRef, run); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); @@ -816,6 +881,8 @@ TEST_F(ETLServiceTests, WriteCommandsAreSerializedOnStrand) return std::move(mockMonitor); }); + // Set cache to be in sync with DB to avoid syncCacheWithDb loop + backend_->cache().update({}, kSEQ, false); EXPECT_CALL(*backend_, hardFetchLedgerRange) .WillRepeatedly(testing::Return(data::LedgerRange{.minSequence = 1, .maxSequence = kSEQ})); EXPECT_CALL(*ledgers_, getMostRecent()).WillOnce(testing::Return(kSEQ)); From 57d88637753d6d0921e015cf5515d08ae23b6e5b Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 14:09:27 +0000 Subject: [PATCH 35/41] Fix other tests --- tests/unit/cluster/BackendTests.cpp | 5 ++- tests/unit/cluster/ClioNodeTests.cpp | 26 +++++++++++- tests/unit/cluster/WriterDeciderTests.cpp | 49 +++++++++++++++++++++-- 3 files changed, 73 insertions(+), 7 deletions(-) diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index d0e4f7c6b2..0155ebf5c1 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -160,7 +160,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) auto const otherUuid = boost::uuids::random_generator{}(); auto const otherNodeJson = R"({ - "db_role": 2, + "db_role": 3, "update_time": "2025-01-15T10:30:00Z" })"; @@ -176,6 +176,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isLoadingCache).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) @@ -255,6 +256,7 @@ TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndD .WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{})); EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); + EXPECT_CALL(writerStateRef, isLoadingCache).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(*backend_, writeNodeMessage) .Times(testing::AtLeast(1)) @@ -274,3 +276,4 @@ TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndD clusterBackend.run(); semaphore.acquire(); } + diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index 1e3b5adb35..adcfb2fded 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -80,7 +80,7 @@ TEST_F(ClioNodeTest, Deserialization) EXPECT_NE(node.uuid, nullptr); EXPECT_EQ(*node.uuid, boost::uuids::uuid{}); EXPECT_EQ(node.updateTime, updateTime); - EXPECT_EQ(node.dbRole, ClioNode::DbRole::NotWriter); + EXPECT_EQ(node.dbRole, ClioNode::DbRole::LoadingCache); } TEST_F(ClioNodeTest, DeserializationInvalidTime) @@ -110,6 +110,7 @@ INSTANTIATE_TEST_SUITE_P( ClioNodeDbRoleTest, testing::Values( ClioNodeDbRoleTestBundle{.testName = "ReadOnly", .role = ClioNode::DbRole::ReadOnly}, + ClioNodeDbRoleTestBundle{.testName = "LoadingCache", .role = ClioNode::DbRole::LoadingCache}, ClioNodeDbRoleTestBundle{.testName = "NotWriter", .role = ClioNode::DbRole::NotWriter}, ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer}, ClioNodeDbRoleTestBundle{.testName = "Fallback", .role = ClioNode::DbRole::Fallback} @@ -155,6 +156,7 @@ struct ClioNodeFromTestBundle { std::string testName; bool readOnly; bool fallback; + bool loadingCache; bool writing; ClioNode::DbRole expectedRole; }; @@ -173,6 +175,7 @@ INSTANTIATE_TEST_SUITE_P( .testName = "ReadOnly", .readOnly = true, .fallback = false, + .loadingCache = false, .writing = false, .expectedRole = ClioNode::DbRole::ReadOnly }, @@ -180,13 +183,23 @@ INSTANTIATE_TEST_SUITE_P( .testName = "Fallback", .readOnly = false, .fallback = true, + .loadingCache = false, .writing = false, .expectedRole = ClioNode::DbRole::Fallback }, + ClioNodeFromTestBundle{ + .testName = "LoadingCache", + .readOnly = false, + .fallback = false, + .loadingCache = true, + .writing = false, + .expectedRole = ClioNode::DbRole::LoadingCache + }, ClioNodeFromTestBundle{ .testName = "NotWriterNotReadOnly", .readOnly = false, .fallback = false, + .loadingCache = false, .writing = false, .expectedRole = ClioNode::DbRole::NotWriter }, @@ -194,6 +207,7 @@ INSTANTIATE_TEST_SUITE_P( .testName = "Writer", .readOnly = false, .fallback = false, + .loadingCache = false, .writing = true, .expectedRole = ClioNode::DbRole::Writer } @@ -209,7 +223,10 @@ TEST_P(ClioNodeFromTest, FromWriterState) if (not param.readOnly) { EXPECT_CALL(writerState, isFallback()).WillOnce(testing::Return(param.fallback)); if (not param.fallback) { - EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + EXPECT_CALL(writerState, isLoadingCache()).WillOnce(testing::Return(param.loadingCache)); + if (not param.loadingCache) { + EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing)); + } } } @@ -222,3 +239,8 @@ TEST_P(ClioNodeFromTest, FromWriterState) EXPECT_GE(node.updateTime, beforeTime); EXPECT_LE(node.updateTime, afterTime); } + + + + + diff --git a/tests/unit/cluster/WriterDeciderTests.cpp b/tests/unit/cluster/WriterDeciderTests.cpp index 3cee39e24d..d9e50f51c1 100644 --- a/tests/unit/cluster/WriterDeciderTests.cpp +++ b/tests/unit/cluster/WriterDeciderTests.cpp @@ -176,10 +176,10 @@ INSTANTIATE_TEST_SUITE_P( .expectedAction = ExpectedAction::StartWriting }, WriterDeciderTestParams{ - .testName = "AllNodesReadOnlyNoActionTaken", + .testName = "AllNodesReadOnlyGiveUpWriting", .selfUuidValue = 0x01, .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::ReadOnly}}, - .expectedAction = ExpectedAction::NoAction + .expectedAction = ExpectedAction::GiveUpWriting }, WriterDeciderTestParams{ .testName = "EmptyClusterDataNoActionTaken", @@ -244,10 +244,10 @@ INSTANTIATE_TEST_SUITE_P( .expectedAction = ExpectedAction::SetFallback }, WriterDeciderTestParams{ - .testName = "SelfIsReadOnlyOthersAreFallbackNoActionTaken", + .testName = "SelfIsReadOnlyOthersAreFallbackGiveUpWriting", .selfUuidValue = 0x01, .nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::Fallback}}, - .expectedAction = ExpectedAction::NoAction + .expectedAction = ExpectedAction::GiveUpWriting }, WriterDeciderTestParams{ .testName = "MultipleFallbackNodesSelfNotFallbackSetsFallback", @@ -267,6 +267,47 @@ INSTANTIATE_TEST_SUITE_P( {0x03, ClioNode::DbRole::Fallback}, {0x04, ClioNode::DbRole::Writer}}, .expectedAction = ExpectedAction::SetFallback + }, + WriterDeciderTestParams{ + .testName = "SelfIsLoadingCacheOtherIsWriter", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::LoadingCache}, {0x02, ClioNode::DbRole::Writer}}, + .expectedAction = ExpectedAction::GiveUpWriting + }, + WriterDeciderTestParams{ + .testName = "OtherNodeIsLoadingCacheSkipToNextWriter", + .selfUuidValue = 0x02, + .nodes = + {{0x01, ClioNode::DbRole::LoadingCache}, + {0x02, ClioNode::DbRole::Writer}, + {0x03, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "AllNodesLoadingCacheNoActionTaken", + .selfUuidValue = 0x01, + .nodes = {{0x01, ClioNode::DbRole::LoadingCache}, {0x02, ClioNode::DbRole::LoadingCache}}, + .expectedAction = ExpectedAction::NoAction + }, + WriterDeciderTestParams{ + .testName = "MixedWithLoadingCacheReadOnlyFirstNonReadOnlyNonLoadingCacheSelected", + .selfUuidValue = 0x03, + .nodes = + {{0x01, ClioNode::DbRole::ReadOnly}, + {0x02, ClioNode::DbRole::LoadingCache}, + {0x03, ClioNode::DbRole::Writer}, + {0x04, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::StartWriting + }, + WriterDeciderTestParams{ + .testName = "LoadingCacheBeforeWriterSkipsLoadingCache", + .selfUuidValue = 0x04, + .nodes = + {{0x01, ClioNode::DbRole::LoadingCache}, + {0x02, ClioNode::DbRole::LoadingCache}, + {0x03, ClioNode::DbRole::Writer}, + {0x04, ClioNode::DbRole::NotWriter}}, + .expectedAction = ExpectedAction::GiveUpWriting } ), [](testing::TestParamInfo const& info) { return info.param.testName; } From edc1dc98e008ad78b9d464cfa3462277d7d76f10 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 14:11:43 +0000 Subject: [PATCH 36/41] Run pre-commit --- src/cluster/WriterDecider.cpp | 1 - src/etl/ETLService.hpp | 3 ++- tests/unit/cluster/BackendTests.cpp | 1 - tests/unit/cluster/ClioNodeTests.cpp | 5 ----- 4 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index 656b2df46f..bd7068a57a 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -24,7 +24,6 @@ #include "etl/WriterState.hpp" #include "util/Assert.hpp" #include "util/Spawn.hpp" -#include "util/log/Logger.hpp" #include diff --git a/src/etl/ETLService.hpp b/src/etl/ETLService.hpp index d4678d1eac..5effa8eecf 100644 --- a/src/etl/ETLService.hpp +++ b/src/etl/ETLService.hpp @@ -215,7 +215,8 @@ class ETLService : public ETLServiceInterface { [[nodiscard]] uint32_t syncCacheWithDb(); - void updateCache(uint32_t seq); + void + updateCache(uint32_t seq); void startMonitor(uint32_t seq); diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index 0155ebf5c1..4d350a8c9a 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -276,4 +276,3 @@ TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndD clusterBackend.run(); semaphore.acquire(); } - diff --git a/tests/unit/cluster/ClioNodeTests.cpp b/tests/unit/cluster/ClioNodeTests.cpp index adcfb2fded..634fbe4445 100644 --- a/tests/unit/cluster/ClioNodeTests.cpp +++ b/tests/unit/cluster/ClioNodeTests.cpp @@ -239,8 +239,3 @@ TEST_P(ClioNodeFromTest, FromWriterState) EXPECT_GE(node.updateTime, beforeTime); EXPECT_LE(node.updateTime, afterTime); } - - - - - From 8b02c3f5c5ed7adb94e9ef6f9a36fe9fa89c576b Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 15:43:44 +0000 Subject: [PATCH 37/41] Fix hanging test --- src/etl/SystemState.hpp | 1 + src/etl/WriterState.hpp | 10 ++++++++++ tests/unit/etl/MonitorTests.cpp | 5 +---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/etl/SystemState.hpp b/src/etl/SystemState.hpp index 6386e0efe0..5f0f12f65b 100644 --- a/src/etl/SystemState.hpp +++ b/src/etl/SystemState.hpp @@ -74,6 +74,7 @@ struct SystemState { "Whether the process is writing to the database" ); + /** @brief Whether the process is still loading cache after startup. */ util::prometheus::Bool isLoadingCache = PrometheusService::boolMetric( "etl_loading_cache", util::prometheus::Labels{}, diff --git a/src/etl/WriterState.hpp b/src/etl/WriterState.hpp index 3f375ca865..b30c5162a9 100644 --- a/src/etl/WriterState.hpp +++ b/src/etl/WriterState.hpp @@ -87,6 +87,11 @@ class WriterStateInterface { virtual void setWriterDecidingFallback() = 0; + /** + * @brief Whether clio is still loading cache after startup. + * + * @return true if clio is still loading cache, false otherwise. + */ [[nodiscard]] virtual bool isLoadingCache() const = 0; @@ -166,6 +171,11 @@ class WriterState : public WriterStateInterface { bool isFallback() const override; + /** + * @brief Whether clio is still loading cache after startup. + * + * @return true if clio is still loading cache, false otherwise. + */ bool isLoadingCache() const override; diff --git a/tests/unit/etl/MonitorTests.cpp b/tests/unit/etl/MonitorTests.cpp index ee185679e8..7cd663cd4e 100644 --- a/tests/unit/etl/MonitorTests.cpp +++ b/tests/unit/etl/MonitorTests.cpp @@ -164,10 +164,7 @@ TEST_F(MonitorTests, DbStalledChannelTriggeredWhenTimeoutExceeded) EXPECT_CALL(*ledgers_, subscribe(testing::_)); EXPECT_CALL(*backend_, hardFetchLedgerRange(testing::_)).WillRepeatedly(testing::Return(std::nullopt)); - EXPECT_CALL(dbStalledMock_, Call()).WillOnce([&]() { - monitor_.stop(); // Prevent monitor to have another loop between semaphore and destructor - unblock.release(); - }); + EXPECT_CALL(dbStalledMock_, Call()).WillOnce([&]() { unblock.release(); }); auto subscription = monitor_.subscribeToDbStalled(dbStalledMock_.AsStdFunction()); monitor_.run(std::chrono::nanoseconds{100}); From 18f2fd98fd66290c1317ca090d048f6c74c472df Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 16:28:36 +0000 Subject: [PATCH 38/41] More ClusterBackend tests for better coverage --- tests/unit/cluster/BackendTests.cpp | 73 +++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index 4d350a8c9a..f3becaf457 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -208,6 +208,42 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) semaphore.acquire(); } +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsOnlySelfData) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const selfNodeJson = R"({ + "db_role": 1, + "update_time": "2025-01-16T10:30:00Z" + })"; + + EXPECT_CALL(*backend_, fetchClioNodesData).Times(testing::AtLeast(1)).WillRepeatedly([&]() { + return BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{*clusterBackend.selfId(), selfNodeJson}} + }; + }); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_TRUE(clusterData->has_value()); + EXPECT_EQ(clusterData->value().size(), 1); + auto const& nodeData = clusterData->value().front(); + EXPECT_EQ(nodeData.uuid, selfId); + EXPECT_EQ(nodeData.dbRole, ClioNode::DbRole::ReadOnly); + EXPECT_LE(nodeData.updateTime, std::chrono::system_clock::now()); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsInvalidJson) { Backend clusterBackend{ @@ -243,6 +279,43 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsInvalidJson) semaphore.acquire(); } +TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsValidJsonButCannotConvertToClioNode) +{ + Backend clusterBackend{ + ctx, backend_, std::move(writerState), std::chrono::milliseconds(1), std::chrono::milliseconds(1) + }; + + clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); + + auto const otherUuid = boost::uuids::random_generator{}(); + // Valid JSON but missing required field 'db_role' + auto const validJsonMissingField = R"({ + "update_time": "2025-01-16T10:30:00Z" + })"; + + EXPECT_CALL(*backend_, fetchClioNodesData) + .Times(testing::AtLeast(1)) + .WillRepeatedly( + testing::Return( + BackendInterface::ClioNodesDataFetchResult{ + std::vector>{{otherUuid, validJsonMissingField}} + } + ) + ); + EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1)); + EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); + EXPECT_CALL(callbackMock, Call) + .Times(testing::AtLeast(1)) + .WillRepeatedly([this](ClioNode::cUUID, std::shared_ptr clusterData) { + SemaphoreReleaseGuard guard{semaphore}; + ASSERT_FALSE(clusterData->has_value()); + EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error converting json to ClioNode")); + }); + + clusterBackend.run(); + semaphore.acquire(); +} + TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndDbRole) { Backend clusterBackend{ From cf9c29cc989ccd9b2b439f78ea5b379aec91c339 Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 16:37:23 +0000 Subject: [PATCH 39/41] Add SystemState tests --- tests/unit/CMakeLists.txt | 1 + tests/unit/etl/SystemStateTests.cpp | 73 +++++++++++++++++++++++++++++ 2 files changed, 74 insertions(+) create mode 100644 tests/unit/etl/SystemStateTests.cpp diff --git a/tests/unit/CMakeLists.txt b/tests/unit/CMakeLists.txt index b070db24f6..600d307f5e 100644 --- a/tests/unit/CMakeLists.txt +++ b/tests/unit/CMakeLists.txt @@ -50,6 +50,7 @@ target_sources( etl/SchedulingTests.cpp etl/SourceImplTests.cpp etl/SubscriptionSourceTests.cpp + etl/SystemStateTests.cpp etl/TaskManagerTests.cpp etl/WriterStateTests.cpp etl/ext/CoreTests.cpp diff --git a/tests/unit/etl/SystemStateTests.cpp b/tests/unit/etl/SystemStateTests.cpp new file mode 100644 index 0000000000..a410011f5b --- /dev/null +++ b/tests/unit/etl/SystemStateTests.cpp @@ -0,0 +1,73 @@ +//------------------------------------------------------------------------------ +/* + This file is part of clio: https://github.com/XRPLF/clio + Copyright (c) 2026, the clio developers. + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +*/ +//============================================================================== + +#include "etl/SystemState.hpp" +#include "util/MockPrometheus.hpp" +#include "util/config/ConfigDefinition.hpp" +#include "util/config/ConfigFileJson.hpp" +#include "util/config/ConfigValue.hpp" +#include "util/config/Types.hpp" + +#include +#include +#include +#include + +#include + +using namespace etl; +using namespace util::config; + +struct SystemStateTest : util::prometheus::WithPrometheus {}; + +TEST_F(SystemStateTest, InitialValuesAreCorrect) +{ + auto state = SystemState{}; + + EXPECT_FALSE(state.isStrictReadonly); + EXPECT_FALSE(state.isWriting); + EXPECT_TRUE(state.isLoadingCache); + EXPECT_FALSE(state.isAmendmentBlocked); + EXPECT_FALSE(state.isCorruptionDetected); + EXPECT_FALSE(state.isWriterDecidingFallback); +} + +struct SystemStateReadOnlyTest : util::prometheus::WithPrometheus, testing::WithParamInterface {}; + +TEST_P(SystemStateReadOnlyTest, MakeSystemStateWithReadOnly) +{ + auto const readOnlyValue = GetParam(); + auto const configJson = boost::json::parse(fmt::format(R"({{"read_only": {}}})", readOnlyValue)); + + auto config = ClioConfigDefinition{{{"read_only", ConfigValue{ConfigType::Boolean}}}}; + auto const configFile = ConfigFileJson{configJson.as_object()}; + auto const errors = config.parse(configFile); + ASSERT_FALSE(errors.has_value()); + + auto state = SystemState::makeSystemState(config); + + EXPECT_EQ(state->isStrictReadonly, readOnlyValue); + EXPECT_FALSE(state->isWriting); + EXPECT_TRUE(state->isLoadingCache); + EXPECT_FALSE(state->isAmendmentBlocked); + EXPECT_FALSE(state->isCorruptionDetected); + EXPECT_FALSE(state->isWriterDecidingFallback); +} + +INSTANTIATE_TEST_SUITE_P(SystemStateTest, SystemStateReadOnlyTest, testing::Values(true, false)); From 8ccd388dc199e3eadb5a50dc93c57c7e488e495e Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Fri, 16 Jan 2026 16:41:15 +0000 Subject: [PATCH 40/41] Add WriterState test --- tests/unit/etl/WriterStateTests.cpp | 53 +++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/tests/unit/etl/WriterStateTests.cpp b/tests/unit/etl/WriterStateTests.cpp index 9dc6e71c3d..872d578fd9 100644 --- a/tests/unit/etl/WriterStateTests.cpp +++ b/tests/unit/etl/WriterStateTests.cpp @@ -107,3 +107,56 @@ TEST_F(WriterStateTest, IsFallbackReturnsSystemStateValue) systemState->isWriterDecidingFallback = true; EXPECT_TRUE(writerState.isFallback()); } + +TEST_F(WriterStateTest, IsReadOnlyReturnsSystemStateValue) +{ + systemState->isStrictReadonly = false; + EXPECT_FALSE(writerState.isReadOnly()); + + systemState->isStrictReadonly = true; + EXPECT_TRUE(writerState.isReadOnly()); +} + +TEST_F(WriterStateTest, IsLoadingCacheReturnsSystemStateValue) +{ + systemState->isLoadingCache = false; + EXPECT_FALSE(writerState.isLoadingCache()); + + systemState->isLoadingCache = true; + EXPECT_TRUE(writerState.isLoadingCache()); +} + +TEST_F(WriterStateTest, CloneCreatesNewInstanceWithSameSystemState) +{ + systemState->isWriting = true; + systemState->isStrictReadonly = true; + systemState->isLoadingCache = false; + + auto cloned = writerState.clone(); + + ASSERT_NE(cloned.get(), &writerState); + EXPECT_TRUE(cloned->isWriting()); + EXPECT_TRUE(cloned->isReadOnly()); + EXPECT_FALSE(cloned->isLoadingCache()); +} + +TEST_F(WriterStateTest, ClonedInstanceSharesSystemState) +{ + auto cloned = writerState.clone(); + + systemState->isWriting = true; + + EXPECT_TRUE(writerState.isWriting()); + EXPECT_TRUE(cloned->isWriting()); + + systemState->isWriting = false; + + EXPECT_FALSE(writerState.isWriting()); + EXPECT_FALSE(cloned->isWriting()); + + EXPECT_FALSE(writerState.isFallback()); + EXPECT_FALSE(cloned->isFallback()); + cloned->setWriterDecidingFallback(); + EXPECT_TRUE(writerState.isFallback()); + EXPECT_TRUE(cloned->isFallback()); +} From 3ad9d7840529afedfbf37342d48c39cab9ac76ec Mon Sep 17 00:00:00 2001 From: Sergey Kuznetsov Date: Tue, 20 Jan 2026 14:00:09 +0000 Subject: [PATCH 41/41] Fix review comments --- src/cluster/Backend.cpp | 6 ++-- src/cluster/Backend.hpp | 8 +++--- src/cluster/ClioNode.cpp | 13 +++++---- src/cluster/ClioNode.hpp | 8 +++--- src/cluster/Metrics.cpp | 7 +++-- src/cluster/Metrics.hpp | 2 +- src/cluster/WriterDecider.cpp | 2 +- src/cluster/WriterDecider.hpp | 2 +- src/cluster/impl/RepeatedTask.hpp | 2 +- src/util/Channel.hpp | 16 +++++------ tests/unit/cluster/BackendTests.cpp | 28 +++++++++---------- .../ClusterCommunicationServiceTests.cpp | 2 +- tests/unit/cluster/MetricsTests.cpp | 17 ++--------- tests/unit/cluster/RepeatedTaskTests.cpp | 3 -- tests/unit/cluster/WriterDeciderTests.cpp | 2 +- tests/unit/etl/SystemStateTests.cpp | 2 +- 16 files changed, 54 insertions(+), 66 deletions(-) diff --git a/src/cluster/Backend.cpp b/src/cluster/Backend.cpp index dad5f42252..78a50cd842 100644 --- a/src/cluster/Backend.cpp +++ b/src/cluster/Backend.cpp @@ -86,7 +86,7 @@ Backend::stop() writerTask_.stop(); } -ClioNode::cUUID +ClioNode::CUuid Backend::selfId() const { return selfUuid_; @@ -103,7 +103,7 @@ Backend::doRead(boost::asio::yield_context yield) } if (!expectedResult.has_value()) { - return std::unexpected{"Failed to fetch nodes data"}; + return std::unexpected{std::move(expectedResult).error()}; } std::vector otherNodesData; @@ -126,7 +126,7 @@ Backend::doRead(boost::asio::yield_context yield) otherNodesData.push_back(std::move(expectedNodeData).value()); } otherNodesData.push_back(ClioNode::from(selfUuid_, *writerState_)); - return std::vector(otherNodesData); + return otherNodesData; } void diff --git a/src/cluster/Backend.hpp b/src/cluster/Backend.hpp index 558ba6cbb1..41ea73f835 100644 --- a/src/cluster/Backend.hpp +++ b/src/cluster/Backend.hpp @@ -66,9 +66,9 @@ class Backend { impl::RepeatedTask readerTask_; impl::RepeatedTask writerTask_; - ClioNode::UUID selfUuid_; + ClioNode::Uuid selfUuid_; - boost::signals2::signal)> onNewState_; + boost::signals2::signal)> onNewState_; public: /** @@ -121,7 +121,7 @@ class Backend { * @return A connection object that can be used to unsubscribe */ template - requires std::invocable> + requires std::invocable> boost::signals2::connection subscribeToNewState(S&& s) { @@ -133,7 +133,7 @@ class Backend { * * @return The UUID of this node. */ - ClioNode::cUUID + ClioNode::CUuid selfId() const; private: diff --git a/src/cluster/ClioNode.cpp b/src/cluster/ClioNode.cpp index 7aa13e76e4..a9dc528e65 100644 --- a/src/cluster/ClioNode.cpp +++ b/src/cluster/ClioNode.cpp @@ -39,7 +39,7 @@ namespace cluster { namespace { -struct Fields { +struct JsonFields { static constexpr std::string_view const kUPDATE_TIME = "update_time"; static constexpr std::string_view const kDB_ROLE = "db_role"; }; @@ -47,7 +47,7 @@ struct Fields { } // namespace ClioNode -ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState) +ClioNode::from(ClioNode::Uuid uuid, etl::WriterStateInterface const& writerState) { auto const dbRole = [&writerState]() { if (writerState.isReadOnly()) { @@ -69,25 +69,26 @@ void tag_invoke(boost::json::value_from_tag, boost::json::value& jv, ClioNode const& node) { jv = { - {Fields::kUPDATE_TIME, util::systemTpToUtcStr(node.updateTime, ClioNode::kTIME_FORMAT)}, - {Fields::kDB_ROLE, static_cast(node.dbRole)} + {JsonFields::kUPDATE_TIME, util::systemTpToUtcStr(node.updateTime, ClioNode::kTIME_FORMAT)}, + {JsonFields::kDB_ROLE, static_cast(node.dbRole)} }; } ClioNode tag_invoke(boost::json::value_to_tag, boost::json::value const& jv) { - auto const& updateTimeStr = jv.as_object().at(Fields::kUPDATE_TIME).as_string(); + auto const& updateTimeStr = jv.as_object().at(JsonFields::kUPDATE_TIME).as_string(); auto const updateTime = util::systemTpFromUtcStr(std::string(updateTimeStr), ClioNode::kTIME_FORMAT); if (!updateTime.has_value()) { throw std::runtime_error("Failed to parse update time"); } - auto const dbRoleValue = jv.as_object().at(Fields::kDB_ROLE).as_int64(); + auto const dbRoleValue = jv.as_object().at(JsonFields::kDB_ROLE).as_int64(); if (dbRoleValue > static_cast(ClioNode::DbRole::MAX)) throw std::runtime_error("Invalid db_role value"); return ClioNode{ + // Json data doesn't contain uuid so leaving it empty here. It will be filled outside of this parsing .uuid = std::make_shared(), .updateTime = updateTime.value(), .dbRole = static_cast(dbRoleValue) diff --git a/src/cluster/ClioNode.hpp b/src/cluster/ClioNode.hpp index 2da8445d57..ea2a83e971 100644 --- a/src/cluster/ClioNode.hpp +++ b/src/cluster/ClioNode.hpp @@ -54,10 +54,10 @@ struct ClioNode { */ enum class DbRole { ReadOnly = 0, LoadingCache = 1, NotWriter = 2, Writer = 3, Fallback = 4, MAX = 4 }; - using UUID = std::shared_ptr; - using cUUID = std::shared_ptr; + using Uuid = std::shared_ptr; + using CUuid = std::shared_ptr; - UUID uuid; ///< The UUID of the node. + Uuid uuid; ///< The UUID of the node. std::chrono::system_clock::time_point updateTime; ///< The time the data about the node was last updated. DbRole dbRole; ///< The database role of the node @@ -69,7 +69,7 @@ struct ClioNode { * @return A ClioNode with the current time and role derived from writerState */ static ClioNode - from(UUID uuid, etl::WriterStateInterface const& writerState); + from(Uuid uuid, etl::WriterStateInterface const& writerState); }; void diff --git a/src/cluster/Metrics.cpp b/src/cluster/Metrics.cpp index ecef9dccad..371a7a203d 100644 --- a/src/cluster/Metrics.cpp +++ b/src/cluster/Metrics.cpp @@ -33,11 +33,14 @@ Metrics::Metrics() } void -Metrics::onNewState(ClioNode::cUUID, std::shared_ptr clusterData) +Metrics::onNewState(ClioNode::CUuid, std::shared_ptr clusterData) { - isHealthy_ = clusterData->has_value(); if (clusterData->has_value()) { + isHealthy_ = true; nodesInClusterMetric_.set(clusterData->value().size()); + } else { + isHealthy_ = false; + nodesInClusterMetric_.set(1); } } diff --git a/src/cluster/Metrics.hpp b/src/cluster/Metrics.hpp index 147f72620e..affea41e7b 100644 --- a/src/cluster/Metrics.hpp +++ b/src/cluster/Metrics.hpp @@ -70,7 +70,7 @@ class Metrics { * @param clusterData Shared pointer to the current cluster data; may be empty if communication failed */ void - onNewState(ClioNode::cUUID uuid, std::shared_ptr clusterData); + onNewState(ClioNode::CUuid uuid, std::shared_ptr clusterData); }; } // namespace cluster diff --git a/src/cluster/WriterDecider.cpp b/src/cluster/WriterDecider.cpp index bd7068a57a..4713ca807b 100644 --- a/src/cluster/WriterDecider.cpp +++ b/src/cluster/WriterDecider.cpp @@ -40,7 +40,7 @@ WriterDecider::WriterDecider(boost::asio::thread_pool& ctx, std::unique_ptr clusterData) +WriterDecider::onNewState(ClioNode::CUuid selfId, std::shared_ptr clusterData) { if (not clusterData->has_value()) return; diff --git a/src/cluster/WriterDecider.hpp b/src/cluster/WriterDecider.hpp index 8ee38990d0..0a3c0bfce6 100644 --- a/src/cluster/WriterDecider.hpp +++ b/src/cluster/WriterDecider.hpp @@ -69,7 +69,7 @@ class WriterDecider { * @param clusterData Shared pointer to current cluster data; may be empty if communication failed */ void - onNewState(ClioNode::cUUID selfId, std::shared_ptr clusterData); + onNewState(ClioNode::CUuid selfId, std::shared_ptr clusterData); }; } // namespace cluster diff --git a/src/cluster/impl/RepeatedTask.hpp b/src/cluster/impl/RepeatedTask.hpp index 37f70cd953..9037c6a47d 100644 --- a/src/cluster/impl/RepeatedTask.hpp +++ b/src/cluster/impl/RepeatedTask.hpp @@ -39,7 +39,7 @@ namespace cluster::impl { -// TODO: Try to replace util/Repeat by this +// TODO: Try to replace util::Repeat by this. https://github.com/XRPLF/clio/issues/2926 template class RepeatedTask { std::chrono::steady_clock::duration interval_; diff --git a/src/util/Channel.hpp b/src/util/Channel.hpp index d85128ae39..af4fbbeeda 100644 --- a/src/util/Channel.hpp +++ b/src/util/Channel.hpp @@ -46,19 +46,19 @@ struct ChannelInstantiated; /** * @brief Specifies the producer concurrency model for a Channel. - * - * - Single: Only one Sender can exist (non-copyable). Uses direct Guard ownership for zero overhead. - * - Multi: Multiple Senders can exist (copyable). Uses shared_ptr for shared ownership. */ -enum class ProducerType { Single, Multi }; +enum class ProducerType { + Single, /**< Only one Sender can exist (non-copyable). Uses direct Guard ownership for zero overhead. */ + Multi /**< Multiple Senders can exist (copyable). Uses shared_ptr for shared ownership. */ +}; /** * @brief Specifies the consumer concurrency model for a Channel. - * - * - Single: Only one Receiver can exist (non-copyable). Uses direct Guard ownership for zero overhead. - * - Multi: Multiple Receivers can exist (copyable). Uses shared_ptr for shared ownership. */ -enum class ConsumerType { Single, Multi }; +enum class ConsumerType { + Single, /**< Only one Receiver can exist (non-copyable). Uses direct Guard ownership for zero overhead. */ + Multi /**< Multiple Receivers can exist (copyable). Uses shared_ptr for shared ownership. */ +}; /** * @brief Represents a go-like channel, a multi-producer (Sender) multi-consumer (Receiver) thread-safe data pipe. diff --git a/tests/unit/cluster/BackendTests.cpp b/tests/unit/cluster/BackendTests.cpp index f3becaf457..869ff18280 100644 --- a/tests/unit/cluster/BackendTests.cpp +++ b/tests/unit/cluster/BackendTests.cpp @@ -57,7 +57,7 @@ struct ClusterBackendTest : util::prometheus::WithPrometheus, MockBackendTestStr boost::asio::thread_pool ctx; std::unique_ptr writerState = std::make_unique(); MockWriterState& writerStateRef = *writerState; - testing::StrictMock)>> + testing::StrictMock)>> callbackMock; std::binary_semaphore semaphore{0}; @@ -90,7 +90,7 @@ TEST_F(ClusterBackendTest, SubscribeToNewState) EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([this](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + .WillRepeatedly([this](ClioNode::CUuid selfId, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_TRUE(clusterData->has_value()); EXPECT_EQ(clusterData->value().size(), 1); @@ -140,10 +140,10 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataThrowsException) EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([this](ClioNode::cUUID, std::shared_ptr clusterData) { + .WillRepeatedly([this](ClioNode::CUuid, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_FALSE(clusterData->has_value()); - EXPECT_EQ(clusterData->error(), "Failed to fetch nodes data"); + EXPECT_EQ(clusterData->error(), "Failed to fetch Clio nodes data"); }); clusterBackend.run(); @@ -159,10 +159,10 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); auto const otherUuid = boost::uuids::random_generator{}(); - auto const otherNodeJson = R"({ + auto const otherNodeJson = R"JSON({ "db_role": 3, "update_time": "2025-01-15T10:30:00Z" - })"; + })JSON"; EXPECT_CALL(*backend_, fetchClioNodesData) .Times(testing::AtLeast(1)) @@ -180,7 +180,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes) EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([&](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + .WillRepeatedly([&](ClioNode::CUuid selfId, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_TRUE(clusterData->has_value()) << clusterData->error(); EXPECT_EQ(clusterData->value().size(), 2); @@ -216,10 +216,10 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsOnlySelfData) clusterBackend.subscribeToNewState(callbackMock.AsStdFunction()); - auto const selfNodeJson = R"({ + auto const selfNodeJson = R"JSON({ "db_role": 1, "update_time": "2025-01-16T10:30:00Z" - })"; + })JSON"; EXPECT_CALL(*backend_, fetchClioNodesData).Times(testing::AtLeast(1)).WillRepeatedly([&]() { return BackendInterface::ClioNodesDataFetchResult{ @@ -230,7 +230,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsOnlySelfData) EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([this](ClioNode::cUUID selfId, std::shared_ptr clusterData) { + .WillRepeatedly([this](ClioNode::CUuid selfId, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_TRUE(clusterData->has_value()); EXPECT_EQ(clusterData->value().size(), 1); @@ -268,7 +268,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsInvalidJson) EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([this, invalidJson](ClioNode::cUUID, std::shared_ptr clusterData) { + .WillRepeatedly([this, invalidJson](ClioNode::CUuid, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_FALSE(clusterData->has_value()); EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error parsing json from DB")); @@ -289,9 +289,9 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsValidJsonButCannotConvertToC auto const otherUuid = boost::uuids::random_generator{}(); // Valid JSON but missing required field 'db_role' - auto const validJsonMissingField = R"({ + auto const validJsonMissingField = R"JSON({ "update_time": "2025-01-16T10:30:00Z" - })"; + })JSON"; EXPECT_CALL(*backend_, fetchClioNodesData) .Times(testing::AtLeast(1)) @@ -306,7 +306,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsValidJsonButCannotConvertToC EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(true)); EXPECT_CALL(callbackMock, Call) .Times(testing::AtLeast(1)) - .WillRepeatedly([this](ClioNode::cUUID, std::shared_ptr clusterData) { + .WillRepeatedly([this](ClioNode::CUuid, std::shared_ptr clusterData) { SemaphoreReleaseGuard guard{semaphore}; ASSERT_FALSE(clusterData->has_value()); EXPECT_THAT(clusterData->error(), testing::HasSubstr("Error converting json to ClioNode")); diff --git a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp index 6c287d5033..87d9651455 100644 --- a/tests/unit/cluster/ClusterCommunicationServiceTests.cpp +++ b/tests/unit/cluster/ClusterCommunicationServiceTests.cpp @@ -13,7 +13,7 @@ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ //============================================================================== diff --git a/tests/unit/cluster/MetricsTests.cpp b/tests/unit/cluster/MetricsTests.cpp index 477895d18d..d747ddfcf0 100644 --- a/tests/unit/cluster/MetricsTests.cpp +++ b/tests/unit/cluster/MetricsTests.cpp @@ -63,13 +63,11 @@ TEST_F(MetricsTest, OnNewStateWithValidClusterData) auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); - // Initial construction expectations EXPECT_CALL(nodesInClusterMock, set(1)); EXPECT_CALL(isHealthyMock, set(1)); Metrics metrics; - // Create cluster data with 3 nodes ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; ClioNode node3{ @@ -80,7 +78,6 @@ TEST_F(MetricsTest, OnNewStateWithValidClusterData) Backend::ClusterData clusterData = std::expected, std::string>(nodes); auto sharedClusterData = std::make_shared(clusterData); - // Expect metrics to be updated: health = true (1), node count = 3 EXPECT_CALL(isHealthyMock, set(1)); EXPECT_CALL(nodesInClusterMock, set(3)); @@ -92,18 +89,15 @@ TEST_F(MetricsTest, OnNewStateWithEmptyClusterData) auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); - // Initial construction expectations EXPECT_CALL(nodesInClusterMock, set(1)); EXPECT_CALL(isHealthyMock, set(1)); Metrics metrics; - // Create empty cluster data (0 nodes) std::vector nodes = {}; Backend::ClusterData clusterData = std::expected, std::string>(nodes); auto sharedClusterData = std::make_shared(clusterData); - // Expect metrics to be updated: health = true (1), node count = 0 EXPECT_CALL(isHealthyMock, set(1)); EXPECT_CALL(nodesInClusterMock, set(0)); @@ -115,19 +109,17 @@ TEST_F(MetricsTest, OnNewStateWithFailedClusterData) auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); - // Initial construction expectations EXPECT_CALL(nodesInClusterMock, set(1)); EXPECT_CALL(isHealthyMock, set(1)); Metrics metrics; - // Create failed cluster data (unexpected error) Backend::ClusterData clusterData = std::expected, std::string>(std::unexpected("Connection failed")); auto sharedClusterData = std::make_shared(clusterData); - // Expect health to be set to false (0), node count should not be updated EXPECT_CALL(isHealthyMock, set(0)); + EXPECT_CALL(nodesInClusterMock, set(1)); metrics.onNewState(uuid1, sharedClusterData); } @@ -137,20 +129,17 @@ TEST_F(MetricsTest, OnNewStateWithSingleNode) auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); - // Initial construction expectations EXPECT_CALL(nodesInClusterMock, set(1)); EXPECT_CALL(isHealthyMock, set(1)); Metrics metrics; - // Create cluster data with just 1 node (self) ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; std::vector nodes = {node1}; Backend::ClusterData clusterData = std::expected, std::string>(nodes); auto sharedClusterData = std::make_shared(clusterData); - // Expect metrics to be updated: health = true (1), node count = 1 EXPECT_CALL(isHealthyMock, set(1)); EXPECT_CALL(nodesInClusterMock, set(1)); @@ -162,22 +151,20 @@ TEST_F(MetricsTest, OnNewStateRecoveryFromFailure) auto& nodesInClusterMock = makeMock("cluster_nodes_total_number", ""); auto& isHealthyMock = makeMock("cluster_communication_is_healthy", ""); - // Initial construction expectations EXPECT_CALL(nodesInClusterMock, set(1)); EXPECT_CALL(isHealthyMock, set(1)); Metrics metrics; - // First update: failure Backend::ClusterData clusterData1 = std::expected, std::string>(std::unexpected("Connection timeout")); auto sharedClusterData1 = std::make_shared(clusterData1); EXPECT_CALL(isHealthyMock, set(0)); + EXPECT_CALL(nodesInClusterMock, set(1)); metrics.onNewState(uuid1, sharedClusterData1); - // Second update: recovery with 2 nodes ClioNode node1{.uuid = uuid1, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::Writer}; ClioNode node2{.uuid = uuid2, .updateTime = std::chrono::system_clock::now(), .dbRole = ClioNode::DbRole::ReadOnly}; diff --git a/tests/unit/cluster/RepeatedTaskTests.cpp b/tests/unit/cluster/RepeatedTaskTests.cpp index 90c0fc6b9a..aa399a372c 100644 --- a/tests/unit/cluster/RepeatedTaskTests.cpp +++ b/tests/unit/cluster/RepeatedTaskTests.cpp @@ -177,17 +177,14 @@ TYPED_TEST(RepeatedTaskTypedTest, TaskStateTransitionsCorrectly) { RepeatedTask task(std::chrono::milliseconds(1), this->ctx_); - // Initially not running task.stop(); // Should be no-op this->expectCalls(3); - // Start running task.run(this->mockFn.AsStdFunction()); EXPECT_TRUE(this->semaphore.try_acquire_for(TestFixture::kTIMEOUT)); - // Stop task.stop(); // Stop again should be no-op diff --git a/tests/unit/cluster/WriterDeciderTests.cpp b/tests/unit/cluster/WriterDeciderTests.cpp index d9e50f51c1..c3e6bbe397 100644 --- a/tests/unit/cluster/WriterDeciderTests.cpp +++ b/tests/unit/cluster/WriterDeciderTests.cpp @@ -111,7 +111,7 @@ TEST_P(WriterDeciderTest, WriterSelection) } std::shared_ptr clusterData; - ClioNode::cUUID selfIdPtr; + ClioNode::CUuid selfIdPtr; if (params.useEmptyClusterData) { clusterData = std::make_shared(std::unexpected(std::string("Communication failed"))); diff --git a/tests/unit/etl/SystemStateTests.cpp b/tests/unit/etl/SystemStateTests.cpp index a410011f5b..aeebd9d661 100644 --- a/tests/unit/etl/SystemStateTests.cpp +++ b/tests/unit/etl/SystemStateTests.cpp @@ -53,7 +53,7 @@ struct SystemStateReadOnlyTest : util::prometheus::WithPrometheus, testing::With TEST_P(SystemStateReadOnlyTest, MakeSystemStateWithReadOnly) { auto const readOnlyValue = GetParam(); - auto const configJson = boost::json::parse(fmt::format(R"({{"read_only": {}}})", readOnlyValue)); + auto const configJson = boost::json::parse(fmt::format(R"JSON({{"read_only": {}}})JSON", readOnlyValue)); auto config = ClioConfigDefinition{{{"read_only", ConfigValue{ConfigType::Boolean}}}}; auto const configFile = ConfigFileJson{configJson.as_object()};