Skip to content

Commit 9fe9e7c

Browse files
authored
fix: Subscription source bugs fix (#1626)
For #1620. - Add timeouts for websocket operations for connections to rippled. Without these timeouts if connection hangs for some reason, clio wouldn't know the connection is hanging. - Fix potential data race in choosing new subscription source which will forward messages to users. - Optimise switching between subscription sources.
1 parent 2e2740d commit 9fe9e7c

File tree

8 files changed

+147
-84
lines changed

8 files changed

+147
-84
lines changed

src/etl/LoadBalancer.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -109,10 +109,13 @@ LoadBalancer::LoadBalancer(
109109
validatedLedgers,
110110
forwardingTimeout,
111111
[this]() {
112-
if (not hasForwardingSource_)
112+
if (not hasForwardingSource_.lock().get())
113+
chooseForwardingSource();
114+
},
115+
[this](bool wasForwarding) {
116+
if (wasForwarding)
113117
chooseForwardingSource();
114118
},
115-
[this]() { chooseForwardingSource(); },
116119
[this]() {
117120
if (forwardingCache_.has_value())
118121
forwardingCache_->invalidate();
@@ -315,11 +318,12 @@ void
315318
LoadBalancer::chooseForwardingSource()
316319
{
317320
LOG(log_.info()) << "Choosing a new source to forward subscriptions";
318-
hasForwardingSource_ = false;
321+
auto hasForwardingSourceLock = hasForwardingSource_.lock();
322+
hasForwardingSourceLock.get() = false;
319323
for (auto& source : sources_) {
320-
if (not hasForwardingSource_ and source->isConnected()) {
324+
if (not hasForwardingSourceLock.get() and source->isConnected()) {
321325
source->setForwarding(true);
322-
hasForwardingSource_ = true;
326+
hasForwardingSourceLock.get() = true;
323327
} else {
324328
source->setForwarding(false);
325329
}

src/etl/LoadBalancer.hpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "etl/Source.hpp"
2626
#include "etl/impl/ForwardingCache.hpp"
2727
#include "feed/SubscriptionManagerInterface.hpp"
28+
#include "util/Mutex.hpp"
2829
#include "util/config/Config.hpp"
2930
#include "util/log/Logger.hpp"
3031

@@ -38,7 +39,6 @@
3839
#include <org/xrpl/rpc/v1/ledger.pb.h>
3940
#include <ripple/proto/org/xrpl/rpc/v1/xrp_ledger.grpc.pb.h>
4041

41-
#include <atomic>
4242
#include <chrono>
4343
#include <cstdint>
4444
#include <memory>
@@ -74,7 +74,10 @@ class LoadBalancer {
7474
std::optional<ETLState> etlState_;
7575
std::uint32_t downloadRanges_ =
7676
DEFAULT_DOWNLOAD_RANGES; /*< The number of markers to use when downloading initial ledger */
77-
std::atomic_bool hasForwardingSource_{false};
77+
78+
// Using mutext instead of atomic_bool because choosing a new source to
79+
// forward messages should be done with a mutual exclusion otherwise there will be a race condition
80+
util::Mutex<bool> hasForwardingSource_{false};
7881

7982
public:
8083
/**

src/etl/Source.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ namespace etl {
5151
class SourceBase {
5252
public:
5353
using OnConnectHook = std::function<void()>;
54-
using OnDisconnectHook = std::function<void()>;
54+
using OnDisconnectHook = std::function<void(bool)>;
5555
using OnLedgerClosedHook = std::function<void()>;
5656

5757
virtual ~SourceBase() = default;

src/etl/impl/SubscriptionSource.cpp

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include "rpc/JS.hpp"
2525
#include "util/Retry.hpp"
2626
#include "util/log/Logger.hpp"
27+
#include "util/prometheus/Label.hpp"
28+
#include "util/prometheus/Prometheus.hpp"
2729
#include "util/requests/Types.hpp"
2830

2931
#include <boost/algorithm/string/classification.hpp>
@@ -66,22 +68,28 @@ SubscriptionSource::SubscriptionSource(
6668
OnConnectHook onConnect,
6769
OnDisconnectHook onDisconnect,
6870
OnLedgerClosedHook onLedgerClosed,
69-
std::chrono::steady_clock::duration const connectionTimeout,
71+
std::chrono::steady_clock::duration const wsTimeout,
7072
std::chrono::steady_clock::duration const retryDelay
7173
)
7274
: log_(fmt::format("SubscriptionSource[{}:{}]", ip, wsPort))
7375
, wsConnectionBuilder_(ip, wsPort)
7476
, validatedLedgers_(std::move(validatedLedgers))
7577
, subscriptions_(std::move(subscriptions))
7678
, strand_(boost::asio::make_strand(ioContext))
79+
, wsTimeout_(wsTimeout)
7780
, retry_(util::makeRetryExponentialBackoff(retryDelay, RETRY_MAX_DELAY, strand_))
7881
, onConnect_(std::move(onConnect))
7982
, onDisconnect_(std::move(onDisconnect))
8083
, onLedgerClosed_(std::move(onLedgerClosed))
84+
, lastMessageTimeSecondsSinceEpoch_(PrometheusService::gaugeInt(
85+
"subscription_source_last_message_time",
86+
util::prometheus::Labels({{"source", fmt::format("{}:{}", ip, wsPort)}}),
87+
"Seconds since epoch of the last message received from rippled subscription streams"
88+
))
8189
{
8290
wsConnectionBuilder_.addHeader({boost::beast::http::field::user_agent, "clio-client"})
8391
.addHeader({"X-User", "clio-client"})
84-
.setConnectionTimeout(connectionTimeout);
92+
.setConnectionTimeout(wsTimeout_);
8593
}
8694

8795
SubscriptionSource::~SubscriptionSource()
@@ -167,21 +175,22 @@ SubscriptionSource::subscribe()
167175
}
168176

169177
wsConnection_ = std::move(connection).value();
170-
isConnected_ = true;
171-
onConnect_();
172-
LOG(log_.info()) << "Connected";
173178

174179
auto const& subscribeCommand = getSubscribeCommandJson();
175-
auto const writeErrorOpt = wsConnection_->write(subscribeCommand, yield);
180+
auto const writeErrorOpt = wsConnection_->write(subscribeCommand, yield, wsTimeout_);
176181
if (writeErrorOpt) {
177182
handleError(writeErrorOpt.value(), yield);
178183
return;
179184
}
180185

186+
isConnected_ = true;
187+
LOG(log_.info()) << "Connected";
188+
onConnect_();
189+
181190
retry_.reset();
182191

183192
while (!stop_) {
184-
auto const message = wsConnection_->read(yield);
193+
auto const message = wsConnection_->read(yield, wsTimeout_);
185194
if (not message) {
186195
handleError(message.error(), yield);
187196
return;
@@ -256,8 +265,6 @@ SubscriptionSource::handleMessage(std::string const& message)
256265
} else if (object.contains(JS(type)) && object.at(JS(type)) == JS_ManifestReceived) {
257266
LOG(log_.debug()) << "Forwarding manifest: " << object;
258267
subscriptions_->forwardManifest(object);
259-
} else {
260-
LOG(log_.error()) << "Unknown message: " << object;
261268
}
262269
}
263270
}
@@ -278,10 +285,10 @@ void
278285
SubscriptionSource::handleError(util::requests::RequestError const& error, boost::asio::yield_context yield)
279286
{
280287
isConnected_ = false;
281-
isForwarding_ = false;
288+
bool const wasForwarding = isForwarding_.exchange(false);
282289
if (not stop_) {
283-
onDisconnect_();
284290
LOG(log_.info()) << "Disconnected";
291+
onDisconnect_(wasForwarding);
285292
}
286293

287294
if (wsConnection_ != nullptr) {
@@ -312,7 +319,11 @@ SubscriptionSource::logError(util::requests::RequestError const& error) const
312319
void
313320
SubscriptionSource::setLastMessageTime()
314321
{
315-
lastMessageTime_.lock().get() = std::chrono::steady_clock::now();
322+
lastMessageTimeSecondsSinceEpoch_.get().set(
323+
std::chrono::duration_cast<std::chrono::seconds>(std::chrono::system_clock::now().time_since_epoch()).count()
324+
);
325+
auto lock = lastMessageTime_.lock();
326+
lock.get() = std::chrono::steady_clock::now();
316327
}
317328

318329
void

src/etl/impl/SubscriptionSource.hpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,13 @@
1919

2020
#pragma once
2121

22-
#include "etl/ETLHelpers.hpp"
22+
#include "etl/NetworkValidatedLedgersInterface.hpp"
2323
#include "etl/Source.hpp"
2424
#include "feed/SubscriptionManagerInterface.hpp"
2525
#include "util/Mutex.hpp"
2626
#include "util/Retry.hpp"
2727
#include "util/log/Logger.hpp"
28+
#include "util/prometheus/Gauge.hpp"
2829
#include "util/requests/Types.hpp"
2930
#include "util/requests/WsConnection.hpp"
3031

@@ -37,6 +38,7 @@
3738
#include <atomic>
3839
#include <chrono>
3940
#include <cstdint>
41+
#include <functional>
4042
#include <future>
4143
#include <memory>
4244
#include <optional>
@@ -71,6 +73,8 @@ class SubscriptionSource {
7173

7274
boost::asio::strand<boost::asio::io_context::executor_type> strand_;
7375

76+
std::chrono::steady_clock::duration wsTimeout_;
77+
7478
util::Retry retry_;
7579

7680
OnConnectHook onConnect_;
@@ -83,9 +87,11 @@ class SubscriptionSource {
8387

8488
util::Mutex<std::chrono::steady_clock::time_point> lastMessageTime_;
8589

90+
std::reference_wrapper<util::prometheus::GaugeInt> lastMessageTimeSecondsSinceEpoch_;
91+
8692
std::future<void> runFuture_;
8793

88-
static constexpr std::chrono::seconds CONNECTION_TIMEOUT{30};
94+
static constexpr std::chrono::seconds WS_TIMEOUT{30};
8995
static constexpr std::chrono::seconds RETRY_MAX_DELAY{30};
9096
static constexpr std::chrono::seconds RETRY_DELAY{1};
9197

@@ -103,7 +109,7 @@ class SubscriptionSource {
103109
* @param onNewLedger The onNewLedger hook. Called when a new ledger is received
104110
* @param onLedgerClosed The onLedgerClosed hook. Called when the ledger is closed but only if the source is
105111
* forwarding
106-
* @param connectionTimeout The connection timeout. Defaults to 30 seconds
112+
* @param wsTimeout A timeout for websocket operations. Defaults to 30 seconds
107113
* @param retryDelay The retry delay. Defaults to 1 second
108114
*/
109115
SubscriptionSource(
@@ -115,7 +121,7 @@ class SubscriptionSource {
115121
OnConnectHook onConnect,
116122
OnDisconnectHook onDisconnect,
117123
OnLedgerClosedHook onLedgerClosed,
118-
std::chrono::steady_clock::duration const connectionTimeout = CONNECTION_TIMEOUT,
124+
std::chrono::steady_clock::duration const wsTimeout = WS_TIMEOUT,
119125
std::chrono::steady_clock::duration const retryDelay = RETRY_DELAY
120126
);
121127

src/util/requests/impl/WsConnectionImpl.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@
3939
#include <boost/beast/websocket/stream_base.hpp>
4040
#include <boost/system/errc.hpp>
4141

42+
#include <atomic>
4243
#include <chrono>
4344
#include <expected>
45+
#include <memory>
4446
#include <optional>
4547
#include <string>
4648
#include <utility>
@@ -123,15 +125,20 @@ class WsConnectionImpl : public WsConnection {
123125
static void
124126
withTimeout(Operation&& operation, boost::asio::yield_context yield, std::chrono::steady_clock::duration timeout)
125127
{
128+
auto isCompleted = std::make_shared<bool>(false);
126129
boost::asio::cancellation_signal cancellationSignal;
127130
auto cyield = boost::asio::bind_cancellation_slot(cancellationSignal.slot(), yield);
128131

129132
boost::asio::steady_timer timer{boost::asio::get_associated_executor(cyield), timeout};
130-
timer.async_wait([&cancellationSignal](boost::system::error_code errorCode) {
131-
if (!errorCode)
133+
134+
// The timer below can be called with no error code even if the operation is completed before the timeout, so we
135+
// need an additional flag here
136+
timer.async_wait([&cancellationSignal, isCompleted](boost::system::error_code errorCode) {
137+
if (!errorCode and not *isCompleted)
132138
cancellationSignal.emit(boost::asio::cancellation_type::terminal);
133139
});
134140
operation(cyield);
141+
*isCompleted = true;
135142
}
136143

137144
static boost::system::error_code

tests/unit/etl/LoadBalancerTests.cpp

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -271,15 +271,12 @@ TEST_F(LoadBalancerOnDisconnectHookTests, source0Disconnects)
271271
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(false));
272272
EXPECT_CALL(sourceFactory_.sourceAt(1), isConnected()).WillOnce(Return(true));
273273
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(true));
274-
sourceFactory_.callbacksAt(0).onDisconnect();
274+
sourceFactory_.callbacksAt(0).onDisconnect(true);
275275
}
276276

277277
TEST_F(LoadBalancerOnDisconnectHookTests, source1Disconnects)
278278
{
279-
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).WillOnce(Return(true));
280-
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(true));
281-
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(false));
282-
sourceFactory_.callbacksAt(1).onDisconnect();
279+
sourceFactory_.callbacksAt(1).onDisconnect(false);
283280
}
284281

285282
TEST_F(LoadBalancerOnDisconnectHookTests, source0DisconnectsAndConnectsBack)
@@ -288,29 +285,25 @@ TEST_F(LoadBalancerOnDisconnectHookTests, source0DisconnectsAndConnectsBack)
288285
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(false));
289286
EXPECT_CALL(sourceFactory_.sourceAt(1), isConnected()).WillOnce(Return(true));
290287
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(true));
291-
sourceFactory_.callbacksAt(0).onDisconnect();
288+
sourceFactory_.callbacksAt(0).onDisconnect(true);
292289

293290
sourceFactory_.callbacksAt(0).onConnect();
294291
}
295292

296293
TEST_F(LoadBalancerOnDisconnectHookTests, source1DisconnectsAndConnectsBack)
297294
{
298-
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).WillOnce(Return(true));
299-
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(true));
300-
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(false));
301-
sourceFactory_.callbacksAt(1).onDisconnect();
302-
295+
sourceFactory_.callbacksAt(1).onDisconnect(false);
303296
sourceFactory_.callbacksAt(1).onConnect();
304297
}
305298

306299
TEST_F(LoadBalancerOnConnectHookTests, bothSourcesDisconnectAndConnectBack)
307300
{
308-
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).Times(2).WillRepeatedly(Return(false));
309-
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(false)).Times(2);
310-
EXPECT_CALL(sourceFactory_.sourceAt(1), isConnected()).Times(2).WillRepeatedly(Return(false));
311-
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(false)).Times(2);
312-
sourceFactory_.callbacksAt(0).onDisconnect();
313-
sourceFactory_.callbacksAt(1).onDisconnect();
301+
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).WillOnce(Return(false));
302+
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(false));
303+
EXPECT_CALL(sourceFactory_.sourceAt(1), isConnected()).WillOnce(Return(false));
304+
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(false));
305+
sourceFactory_.callbacksAt(0).onDisconnect(true);
306+
sourceFactory_.callbacksAt(1).onDisconnect(false);
314307

315308
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).WillOnce(Return(true));
316309
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(true));
@@ -353,12 +346,7 @@ TEST_F(LoadBalancer3SourcesTests, forwardingUpdate)
353346
sourceFactory_.callbacksAt(1).onConnect();
354347

355348
// Source 0 got disconnected
356-
EXPECT_CALL(sourceFactory_.sourceAt(0), isConnected()).WillOnce(Return(false));
357-
EXPECT_CALL(sourceFactory_.sourceAt(0), setForwarding(false));
358-
EXPECT_CALL(sourceFactory_.sourceAt(1), isConnected()).WillOnce(Return(true));
359-
EXPECT_CALL(sourceFactory_.sourceAt(1), setForwarding(true));
360-
EXPECT_CALL(sourceFactory_.sourceAt(2), setForwarding(false)); // only source 1 must be forwarding
361-
sourceFactory_.callbacksAt(0).onDisconnect();
349+
sourceFactory_.callbacksAt(0).onDisconnect(false);
362350
}
363351

364352
struct LoadBalancerLoadInitialLedgerTests : LoadBalancerOnConnectHookTests {

0 commit comments

Comments
 (0)