Skip to content

Commit 584417a

Browse files
committed
Fix tests and add new
1 parent a64c8c4 commit 584417a

File tree

13 files changed

+183
-7
lines changed

13 files changed

+183
-7
lines changed

src/cluster/ClioNode.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ struct Fields {
4949
ClioNode
5050
ClioNode::from(ClioNode::UUID uuid, etl::WriterStateInterface const& writerState)
5151
{
52+
// Determine the database role based on writer state priority:
53+
// 1. ReadOnly takes precedence (configured mode)
54+
// 2. Fallback mode indicates cluster-wide fallback mechanism is active
55+
// 3. Otherwise, Writer or NotWriter based on current writing state
5256
auto const dbRole = [&writerState]() {
5357
if (writerState.isReadOnly()) {
5458
return ClioNode::DbRole::ReadOnly;

src/cluster/ClioNode.hpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,19 @@ struct ClioNode {
3939
*/
4040
static constexpr char const* kTIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ";
4141

42-
/** @brief Database role */
42+
/**
43+
* @brief Database role of a node in the cluster.
44+
*
45+
* Roles are used to coordinate which node writes to the database:
46+
* - ReadOnly: Node is configured to never write (strict read-only mode)
47+
* - NotWriter: Node can write but is currently not the designated writer
48+
* - Writer: Node is actively writing to the database
49+
* - Fallback: Node is using the fallback writer decision mechanism
50+
*
51+
* When any node in the cluster is in Fallback mode, the entire cluster switches
52+
* from the cluster communication mechanism to the slower but more reliable
53+
* database-based conflict detection mechanism.
54+
*/
4355
enum class DbRole { ReadOnly = 0, NotWriter = 1, Writer = 2, Fallback = 3, MAX = 3 };
4456

4557
using UUID = std::shared_ptr<boost::uuids::uuid>;

src/cluster/WriterDecider.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,13 +51,19 @@ WriterDecider::onNewState(ClioNode::cUUID selfId, std::shared_ptr<Backend::Clust
5151
[writerState = writerState_->clone(),
5252
selfId = std::move(selfId),
5353
clusterData = clusterData->value()](auto&&) mutable {
54+
// Find this node's data in the cluster state
5455
auto const selfData =
5556
std::ranges::find_if(clusterData, [&selfId](ClioNode const& node) { return node.uuid == selfId; });
5657
ASSERT(selfData != clusterData.end(), "Self data should always be in the cluster data");
58+
59+
// ReadOnly nodes never participate in writer decisions
60+
// Fallback nodes have already switched to fallback mechanism
5761
if (selfData->dbRole == ClioNode::DbRole::ReadOnly or selfData->dbRole == ClioNode::DbRole::Fallback) {
5862
return;
5963
}
6064

65+
// If any node in the cluster is in Fallback mode, the entire cluster must switch
66+
// to the fallback writer decision mechanism for consistency
6167
if (std::ranges::any_of(clusterData, [](ClioNode const& node) {
6268
return node.dbRole == ClioNode::DbRole::Fallback;
6369
})) {

src/etl/ETLService.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,8 @@ ETLService::startMonitor(uint32_t seq)
395395

396396
monitorDbStalledSubscription_ = monitor_->subscribeToDbStalled([this]() {
397397
LOG(log_.warn()) << "ETLService received DbStalled signal from Monitor";
398+
// Database stall detected - no writer has been active for 10 seconds
399+
// This triggers the fallback mechanism and attempts to become the writer
398400
if (not state_->isStrictReadonly and not state_->isWriting)
399401
state_->writeCommandSignal(SystemState::WriteCommand::StartWriting);
400402
state_->isWriterDecidingFallback = true;
@@ -435,3 +437,4 @@ ETLService::giveUpWriter()
435437
}
436438

437439
} // namespace etl
440+

src/etl/SystemState.hpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,11 +113,24 @@ struct SystemState {
113113
"Whether clio detected a corruption that needs manual attention"
114114
);
115115

116+
/**
117+
* @brief Whether the cluster is using the fallback writer decision mechanism.
118+
*
119+
* The fallback mechanism is triggered when:
120+
* - The database stalls for 10 seconds (detected by Monitor), indicating no active writer
121+
* - A write conflict is detected, indicating multiple nodes attempting to write simultaneously
122+
*
123+
* When fallback mode is active, the cluster stops using the cluster communication mechanism
124+
* (TTL-based role announcements) and relies on the slower but more reliable database-based
125+
* conflict detection. This flag propagates across the cluster - if any node enters fallback
126+
* mode, all nodes in the cluster will switch to fallback mode.
127+
*/
116128
util::prometheus::Bool isWriterDecidingFallback = PrometheusService::boolMetric(
117129
"etl_writing_deciding_fallback",
118130
util::prometheus::Labels{},
119-
"Whether clio detected a corruption that needs manual attention"
131+
"Whether the cluster is using the fallback writer decision mechanism"
120132
);
121133
};
122134

123135
} // namespace etl
136+

src/etl/WriterState.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,22 @@ class WriterStateInterface {
6868
virtual void
6969
giveUpWriting() = 0;
7070

71+
/**
72+
* @brief Check if the cluster is using the fallback writer decision mechanism.
73+
*
74+
* @return true if the cluster has switched to fallback mode, false otherwise
75+
*/
7176
[[nodiscard]] virtual bool
7277
isFallback() const = 0;
7378

79+
/**
80+
* @brief Switch the cluster to the fallback writer decision mechanism.
81+
*
82+
* This method is called when the cluster needs to transition from the cluster
83+
* communication mechanism to the slower but more reliable fallback mechanism.
84+
* Once set, this flag propagates to all nodes in the cluster through the
85+
* ClioNode DbRole::Fallback state.
86+
*/
7487
virtual void
7588
setWriterDecidingFallback() = 0;
7689

@@ -124,9 +137,20 @@ class WriterState : public WriterStateInterface {
124137
void
125138
giveUpWriting() override;
126139

140+
/**
141+
* @brief Switch the cluster to the fallback writer decision mechanism.
142+
*
143+
* Sets the isWriterDecidingFallback flag in the system state, which will be
144+
* propagated to other nodes in the cluster through the ClioNode DbRole::Fallback state.
145+
*/
127146
void
128147
setWriterDecidingFallback() override;
129148

149+
/**
150+
* @brief Check if the cluster is using the fallback writer decision mechanism.
151+
*
152+
* @return true if the cluster has switched to fallback mode, false otherwise
153+
*/
130154
bool
131155
isFallback() const override;
132156

@@ -135,3 +159,5 @@ class WriterState : public WriterStateInterface {
135159
};
136160

137161
} // namespace etl
162+
163+

src/etl/impl/Loading.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ Loader::load(model::LedgerData const& data)
7575
<< "; took " << duration << "ms";
7676

7777
if (not success) {
78+
// Write conflict detected - another node wrote to the database
79+
// This triggers the fallback mechanism and stops this node from writing
7880
state_->writeCommandSignal(SystemState::WriteCommand::StopWriting);
7981
state_->isWriterDecidingFallback = true;
8082
LOG(log_.warn()) << "Another node wrote a ledger into the DB - we have a write conflict";
@@ -155,3 +157,4 @@ Loader::loadInitialLedger(model::LedgerData const& data)
155157
}
156158

157159
} // namespace etl::impl
160+

tests/unit/cluster/BackendTests.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@ TEST_F(ClusterBackendTest, FetchClioNodesDataReturnsDataWithOtherNodes)
175175
);
176176
EXPECT_CALL(*backend_, writeNodeMessage).Times(testing::AtLeast(1));
177177
EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
178+
EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
178179
EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
179180
EXPECT_CALL(callbackMock, Call)
180181
.Times(testing::AtLeast(1))
@@ -253,6 +254,7 @@ TEST_F(ClusterBackendTest, WriteNodeMessageWritesSelfDataWithRecentTimestampAndD
253254
.Times(testing::AtLeast(1))
254255
.WillRepeatedly(testing::Return(BackendInterface::ClioNodesDataFetchResult{}));
255256
EXPECT_CALL(writerStateRef, isReadOnly).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
257+
EXPECT_CALL(writerStateRef, isFallback).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
256258
EXPECT_CALL(writerStateRef, isWriting).Times(testing::AtLeast(1)).WillRepeatedly(testing::Return(false));
257259
EXPECT_CALL(*backend_, writeNodeMessage)
258260
.Times(testing::AtLeast(1))

tests/unit/cluster/ClioNodeTests.cpp

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ INSTANTIATE_TEST_SUITE_P(
111111
testing::Values(
112112
ClioNodeDbRoleTestBundle{.testName = "ReadOnly", .role = ClioNode::DbRole::ReadOnly},
113113
ClioNodeDbRoleTestBundle{.testName = "NotWriter", .role = ClioNode::DbRole::NotWriter},
114-
ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer}
114+
ClioNodeDbRoleTestBundle{.testName = "Writer", .role = ClioNode::DbRole::Writer},
115+
ClioNodeDbRoleTestBundle{.testName = "Fallback", .role = ClioNode::DbRole::Fallback}
115116
),
116117
tests::util::kNAME_GENERATOR
117118
);
@@ -153,6 +154,7 @@ TEST_F(ClioNodeDbRoleTest, DeserializationMissingDbRole)
153154
struct ClioNodeFromTestBundle {
154155
std::string testName;
155156
bool readOnly;
157+
bool fallback;
156158
bool writing;
157159
ClioNode::DbRole expectedRole;
158160
};
@@ -170,18 +172,28 @@ INSTANTIATE_TEST_SUITE_P(
170172
ClioNodeFromTestBundle{
171173
.testName = "ReadOnly",
172174
.readOnly = true,
175+
.fallback = false,
173176
.writing = false,
174177
.expectedRole = ClioNode::DbRole::ReadOnly
175178
},
179+
ClioNodeFromTestBundle{
180+
.testName = "Fallback",
181+
.readOnly = false,
182+
.fallback = true,
183+
.writing = false,
184+
.expectedRole = ClioNode::DbRole::Fallback
185+
},
176186
ClioNodeFromTestBundle{
177187
.testName = "NotWriterNotReadOnly",
178188
.readOnly = false,
189+
.fallback = false,
179190
.writing = false,
180191
.expectedRole = ClioNode::DbRole::NotWriter
181192
},
182193
ClioNodeFromTestBundle{
183194
.testName = "Writer",
184195
.readOnly = false,
196+
.fallback = false,
185197
.writing = true,
186198
.expectedRole = ClioNode::DbRole::Writer
187199
}
@@ -195,7 +207,10 @@ TEST_P(ClioNodeFromTest, FromWriterState)
195207

196208
EXPECT_CALL(writerState, isReadOnly()).WillOnce(testing::Return(param.readOnly));
197209
if (not param.readOnly) {
198-
EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing));
210+
EXPECT_CALL(writerState, isFallback()).WillOnce(testing::Return(param.fallback));
211+
if (not param.fallback) {
212+
EXPECT_CALL(writerState, isWriting()).WillOnce(testing::Return(param.writing));
213+
}
199214
}
200215

201216
auto const beforeTime = std::chrono::system_clock::now();
@@ -207,3 +222,7 @@ TEST_P(ClioNodeFromTest, FromWriterState)
207222
EXPECT_GE(node.updateTime, beforeTime);
208223
EXPECT_LE(node.updateTime, afterTime);
209224
}
225+
226+
227+
228+

tests/unit/cluster/WriterDeciderTests.cpp

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737

3838
using namespace cluster;
3939

40-
enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction };
40+
enum class ExpectedAction { StartWriting, GiveUpWriting, NoAction, SetFallback };
4141

4242
struct WriterDeciderTestParams {
4343
std::string testName;
@@ -97,6 +97,10 @@ TEST_P(WriterDeciderTest, WriterSelection)
9797
EXPECT_CALL(*clonedState, giveUpWriting());
9898
EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState))));
9999
break;
100+
case ExpectedAction::SetFallback:
101+
EXPECT_CALL(*clonedState, setWriterDecidingFallback());
102+
EXPECT_CALL(writerStateRef, clone()).WillOnce(testing::Return(testing::ByMove(std::move(clonedState))));
103+
break;
100104
case ExpectedAction::NoAction:
101105
if (not params.useEmptyClusterData) {
102106
// For all-ReadOnly case, we still clone but don't call any action
@@ -107,19 +111,25 @@ TEST_P(WriterDeciderTest, WriterSelection)
107111
}
108112

109113
std::shared_ptr<Backend::ClusterData> clusterData;
114+
ClioNode::cUUID selfIdPtr;
110115

111116
if (params.useEmptyClusterData) {
112117
clusterData = std::make_shared<Backend::ClusterData>(std::unexpected(std::string("Communication failed")));
118+
selfIdPtr = std::make_shared<boost::uuids::uuid>(selfUuid);
113119
} else {
114120
std::vector<ClioNode> nodes;
115121
nodes.reserve(params.nodes.size());
116122
for (auto const& [uuidValue, role] : params.nodes) {
117-
nodes.push_back(makeNode(makeUuid(uuidValue), role));
123+
auto node = makeNode(makeUuid(uuidValue), role);
124+
if (uuidValue == params.selfUuidValue) {
125+
selfIdPtr = node.uuid; // Use the same shared_ptr as in the node
126+
}
127+
nodes.push_back(std::move(node));
118128
}
119129
clusterData = std::make_shared<Backend::ClusterData>(std::move(nodes));
120130
}
121131

122-
decider.onNewState(std::make_shared<boost::uuids::uuid>(selfUuid), clusterData);
132+
decider.onNewState(selfIdPtr, clusterData);
123133

124134
ctx.join();
125135
}
@@ -220,6 +230,43 @@ INSTANTIATE_TEST_SUITE_P(
220230
{0x03, ClioNode::DbRole::Writer},
221231
{0x02, ClioNode::DbRole::ReadOnly}},
222232
.expectedAction = ExpectedAction::StartWriting
233+
},
234+
WriterDeciderTestParams{
235+
.testName = "SelfIsFallbackNoActionTaken",
236+
.selfUuidValue = 0x01,
237+
.nodes = {{0x01, ClioNode::DbRole::Fallback}, {0x02, ClioNode::DbRole::Writer}},
238+
.expectedAction = ExpectedAction::NoAction
239+
},
240+
WriterDeciderTestParams{
241+
.testName = "OtherNodeIsFallbackSetsFallbackMode",
242+
.selfUuidValue = 0x01,
243+
.nodes = {{0x01, ClioNode::DbRole::Writer}, {0x02, ClioNode::DbRole::Fallback}},
244+
.expectedAction = ExpectedAction::SetFallback
245+
},
246+
WriterDeciderTestParams{
247+
.testName = "SelfIsReadOnlyOthersAreFallbackNoActionTaken",
248+
.selfUuidValue = 0x01,
249+
.nodes = {{0x01, ClioNode::DbRole::ReadOnly}, {0x02, ClioNode::DbRole::Fallback}},
250+
.expectedAction = ExpectedAction::NoAction
251+
},
252+
WriterDeciderTestParams{
253+
.testName = "MultipleFallbackNodesSelfNotFallbackSetsFallback",
254+
.selfUuidValue = 0x03,
255+
.nodes =
256+
{{0x01, ClioNode::DbRole::Fallback},
257+
{0x02, ClioNode::DbRole::Fallback},
258+
{0x03, ClioNode::DbRole::Writer}},
259+
.expectedAction = ExpectedAction::SetFallback
260+
},
261+
WriterDeciderTestParams{
262+
.testName = "MixedRolesWithOneFallbackSetsFallback",
263+
.selfUuidValue = 0x02,
264+
.nodes =
265+
{{0x01, ClioNode::DbRole::Writer},
266+
{0x02, ClioNode::DbRole::NotWriter},
267+
{0x03, ClioNode::DbRole::Fallback},
268+
{0x04, ClioNode::DbRole::Writer}},
269+
.expectedAction = ExpectedAction::SetFallback
223270
}
224271
),
225272
[](testing::TestParamInfo<WriterDeciderTestParams> const& info) { return info.param.testName; }

0 commit comments

Comments
 (0)