Skip to content

Commit 2d03628

Browse files
authored
feat: DEAD channel detection is now everywhere (#612)
* refactoring heartbeat mechanism Signed-off-by: dorjesinpo <[email protected]> * BrokerResponse w/ heartbeat config Signed-off-by: dorjesinpo <[email protected]> * Addressing review Signed-off-by: dorjesinpo <[email protected]> --------- Signed-off-by: dorjesinpo <[email protected]>
1 parent 062d6b0 commit 2d03628

22 files changed

+898
-278
lines changed

docker/cluster/config/bmqbrkrcfg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
},
7474
"networkInterfaces": {
7575
"heartbeats": {
76-
"client": 0,
76+
"client": 10,
7777
"downstreamBroker": 10,
7878
"upstreamBroker": 10,
7979
"clusterPeer": 10

docker/single-node/config/bmqbrkrcfg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
},
7474
"networkInterfaces": {
7575
"heartbeats": {
76-
"client": 0,
76+
"client": 10,
7777
"downstreamBroker": 10,
7878
"upstreamBroker": 10,
7979
"clusterPeer": 10

src/applications/bmqbrkr/etc/bmqbrkrcfg.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
},
7474
"networkInterfaces": {
7575
"heartbeats": {
76-
"client": 0,
76+
"client": 10,
7777
"downstreamBroker": 10,
7878
"upstreamBroker": 10,
7979
"clusterPeer": 10

src/groups/bmq/bmqimp/bmqimp_application.cpp

Lines changed: 98 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ namespace {
7474
const double k_RECONNECT_INTERVAL_MS = 500;
7575
const int k_RECONNECT_COUNT = bsl::numeric_limits<int>::max();
7676
const bsls::Types::Int64 k_CHANNEL_LOW_WATERMARK = 512 * 1024;
77+
const int k_DEFAULT_MAX_MISSED_HEARTBEATS = 10;
78+
const int k_DEFAULT_HEARTBEAT_INTERVAL_MS = 1000;
7779

7880
/// Create the StatContextConfiguration to use, from the specified
7981
/// `options`, and using the specified `allocator` for memory allocations.
@@ -144,6 +146,8 @@ void Application::onChannelDown(const bsl::string& peerUri,
144146
{
145147
// executed by the *IO* thread
146148

149+
stopHeartbeat();
150+
147151
BALL_LOG_INFO << "Session with '" << peerUri << "' is now DOWN"
148152
<< " [status: " << status << "]";
149153

@@ -159,10 +163,12 @@ void Application::onChannelWatermark(const bsl::string& peerUri,
159163
d_brokerSession.handleChannelWatermark(type);
160164
}
161165

162-
void Application::readCb(const bmqio::Status& status,
163-
int* numNeeded,
164-
bdlbb::Blob* blob,
165-
const bsl::shared_ptr<bmqio::Channel>& channel)
166+
void Application::readCb(
167+
const bmqio::Status& status,
168+
int* numNeeded,
169+
bdlbb::Blob* blob,
170+
const bsl::shared_ptr<bmqio::Channel>& channel,
171+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor)
166172
{
167173
// executed by the *IO* thread
168174

@@ -201,10 +207,15 @@ void Application::readCb(const bmqio::Status& status,
201207
return; // RETURN
202208
}
203209

204-
BALL_LOG_TRACE << channel->peerUri() << ": ReadCallback got a blob\n"
205-
<< bmqu::BlobStartHexDumper(&readBlob);
210+
// Create a raw event with a cloned blob
211+
bmqp::Event event(&readBlob, &d_allocator, true);
212+
213+
if (monitor->checkData(channel.get(), event)) {
214+
BALL_LOG_TRACE << channel->peerUri() << ": ReadCallback got a blob\n"
215+
<< bmqu::BlobStartHexDumper(&readBlob);
206216

207-
d_brokerSession.processPacket(readBlob);
217+
d_brokerSession.processPacket(event);
218+
}
208219
}
209220

210221
void Application::channelStateCallback(
@@ -238,6 +249,9 @@ void Application::channelStateCallback(
238249

239250
d_brokerSession.setChannel(channel);
240251

252+
bsl::shared_ptr<bmqp::HeartbeatMonitor> monitor = createMonitor(
253+
channel);
254+
241255
// Initiate read flow
242256
bmqio::Status st;
243257
channel->read(
@@ -248,7 +262,8 @@ void Application::channelStateCallback(
248262
bdlf::PlaceHolders::_1, // status
249263
bdlf::PlaceHolders::_2, // numNeeded
250264
bdlf::PlaceHolders::_3, // blob
251-
channel));
265+
channel,
266+
monitor));
252267
if (!st) {
253268
BALL_LOG_ERROR << "Could not read from channel:"
254269
<< " [peer: " << channel->peerUri()
@@ -260,6 +275,8 @@ void Application::channelStateCallback(
260275
// Cancel the timeout event (if the handle is invalid, this will just
261276
// do nothing)
262277
d_scheduler.cancelEvent(&d_startTimeoutHandle);
278+
279+
startHeartbeat(channel, monitor);
263280
} break; // BREAK
264281
case bmqio::ChannelFactoryEvent::e_CONNECT_ATTEMPT_FAILED: {
265282
BALL_LOG_DEBUG << "Failed an attempt to establish a session with '"
@@ -599,6 +616,7 @@ Application::Application(
599616
, d_statSnaphotTimerHandle()
600617
, d_nextStatDump(-1)
601618
, d_lastAllocatorSnapshot(0)
619+
, d_heartbeatSchedulerHandle()
602620
{
603621
// NOTE:
604622
// o The persistent session pool must live longer than the brokerSession
@@ -721,6 +739,8 @@ void Application::stop()
721739
BALL_LOG_INFO << "::: STOP (SYNC) [state: " << d_brokerSession.state()
722740
<< "] :::";
723741

742+
stopHeartbeat();
743+
724744
// Stop the brokerSession
725745
d_brokerSession.stop();
726746
}
@@ -730,9 +750,79 @@ void Application::stopAsync()
730750
BALL_LOG_INFO << "::: STOP (ASYNC) [state: " << d_brokerSession.state()
731751
<< "] :::";
732752

753+
stopHeartbeat();
754+
733755
// Stop the brokerSession
734756
d_brokerSession.stopAsync();
735757
}
736758

759+
bsl::shared_ptr<bmqp::HeartbeatMonitor>
760+
Application::createMonitor(const bsl::shared_ptr<bmqio::Channel>& channel)
761+
{
762+
int maxMissedHeartbeats = k_DEFAULT_MAX_MISSED_HEARTBEATS;
763+
764+
channel->properties().load(
765+
&maxMissedHeartbeats,
766+
NegotiatedChannelFactory::k_CHANNEL_PROPERTY_MAX_MISSED_HEARTBEATS);
767+
768+
bsl::shared_ptr<bmqp::HeartbeatMonitor> monitor(
769+
new (d_allocator) bmqp::HeartbeatMonitor(maxMissedHeartbeats),
770+
&d_allocator);
771+
772+
return monitor;
773+
}
774+
775+
void Application::startHeartbeat(
776+
const bsl::shared_ptr<bmqio::Channel>& channel,
777+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor)
778+
{
779+
BSLS_ASSERT_SAFE(monitor);
780+
781+
if (!monitor->isHearbeatEnabled()) {
782+
return; // RETURN
783+
}
784+
785+
int heartbeatIntervalMs = k_DEFAULT_HEARTBEAT_INTERVAL_MS;
786+
787+
channel->properties().load(
788+
&heartbeatIntervalMs,
789+
NegotiatedChannelFactory::k_CHANNEL_PROPERTY_HEARTBEAT_INTERVAL_MS);
790+
791+
bsls::TimeInterval interval;
792+
interval.addMilliseconds(heartbeatIntervalMs);
793+
794+
d_scheduler.scheduleRecurringEvent(
795+
&d_heartbeatSchedulerHandle,
796+
interval,
797+
bdlf::BindUtil::bind(&Application::onHeartbeatSchedulerEvent,
798+
this,
799+
channel,
800+
monitor));
801+
}
802+
void Application::stopHeartbeat()
803+
{
804+
d_scheduler.cancelEventAndWait(&d_heartbeatSchedulerHandle);
805+
}
806+
807+
void Application::onHeartbeatSchedulerEvent(
808+
const bsl::shared_ptr<bmqio::Channel>& channel,
809+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor)
810+
{
811+
// executed by the *SCHEDULER* thread
812+
813+
BSLS_ASSERT_SAFE(monitor);
814+
BSLS_ASSERT_SAFE(monitor->maxMissedHeartbeats());
815+
816+
if (!monitor->checkHeartbeat(channel.get())) {
817+
BALL_LOG_WARN << "#TCP_DEAD_CHANNEL "
818+
<< "Closing unresponsive channel after "
819+
<< monitor->maxMissedHeartbeats()
820+
<< " missed heartbeats [channel: '" << channel->peerUri()
821+
<< "']";
822+
823+
channel->close();
824+
}
825+
}
826+
737827
} // close package namespace
738828
} // close enterprise namespace

src/groups/bmq/bmqimp/bmqimp_application.h

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include <bmqimp_eventqueue.h>
4141
#include <bmqimp_negotiatedchannelfactory.h>
4242
#include <bmqp_ctrlmsg_messages.h>
43+
#include <bmqp_heartbeatmonitor.h>
4344
#include <bmqt_sessionoptions.h>
4445

4546
#include <bmqio_channel.h>
@@ -159,6 +160,9 @@ class Application {
159160
// the snapshot was performed on the
160161
// Counting Allocators context
161162

163+
/// Scheduler handle of the recurring event to monitor channels heartbeats.
164+
bdlmt::EventSchedulerRecurringEventHandle d_heartbeatSchedulerHandle;
165+
162166
private:
163167
// PRIVATE MANIPULATORS
164168
void onChannelDown(const bsl::string& peerUri,
@@ -167,10 +171,11 @@ class Application {
167171
void onChannelWatermark(const bsl::string& peerUri,
168172
bmqio::ChannelWatermarkType::Enum type);
169173

170-
void readCb(const bmqio::Status& status,
171-
int* numNeeded,
172-
bdlbb::Blob* blob,
173-
const bsl::shared_ptr<bmqio::Channel>& channel);
174+
void readCb(const bmqio::Status& status,
175+
int* numNeeded,
176+
bdlbb::Blob* blob,
177+
const bsl::shared_ptr<bmqio::Channel>& channel,
178+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor);
174179

175180
void channelStateCallback(const bsl::string& endpoint,
176181
bmqio::ChannelFactoryEvent::Enum event,
@@ -212,6 +217,19 @@ class Application {
212217
bmqimp::BrokerSession::State::Enum newState,
213218
bmqimp::BrokerSession::FsmEvent::Enum event);
214219

220+
/// Recurring scheduler event to check for all `heartbeat-enabled`
221+
/// channels : this will send a heartbeat if no data has been received
222+
/// on a given channel, or proactively reset the channel if too many
223+
/// heartbeats have been missed.
224+
void onHeartbeatSchedulerEvent(
225+
const bsl::shared_ptr<bmqio::Channel>& channel,
226+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor);
227+
bsl::shared_ptr<bmqp::HeartbeatMonitor>
228+
createMonitor(const bsl::shared_ptr<bmqio::Channel>& channel);
229+
void startHeartbeat(const bsl::shared_ptr<bmqio::Channel>& channel,
230+
const bsl::shared_ptr<bmqp::HeartbeatMonitor>& monitor);
231+
void stopHeartbeat();
232+
215233
private:
216234
// NOT IMPLEMENTED
217235
Application(const Application& other) BSLS_CPP11_DELETED;

src/groups/bmq/bmqimp/bmqimp_brokersession.cpp

Lines changed: 13 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -3430,18 +3430,6 @@ void BrokerSession::processControlEvent(const bmqp::Event& event)
34303430
}
34313431
}
34323432

3433-
void BrokerSession::onHeartbeat()
3434-
{
3435-
// executed by the *IO* thread
3436-
// Add to the FSM event queue
3437-
bsl::shared_ptr<Event> queueEvent = createEvent();
3438-
queueEvent->configureAsRequestEvent(
3439-
bdlf::BindUtil::bind(&BrokerSession::doHandleHeartbeat,
3440-
this,
3441-
bdlf::PlaceHolders::_1)); // eventImpl
3442-
enqueueFsmEvent(queueEvent);
3443-
}
3444-
34453433
void BrokerSession::enableMessageRetransmission(
34463434
const bmqp::PutMessageIterator& putIter,
34473435
const bsls::TimeInterval& sentTime)
@@ -4332,28 +4320,6 @@ void BrokerSession::doHandleChannelWatermark(
43324320
}
43334321
}
43344322

4335-
void BrokerSession::doHandleHeartbeat(
4336-
BSLS_ANNOTATION_UNUSED const bsl::shared_ptr<Event>& eventSp)
4337-
{
4338-
// executed by the FSM thread
4339-
4340-
BSLS_ASSERT_SAFE(d_fsmThreadChecker.inSameThread());
4341-
4342-
// The broker sent a heartbeat to check on us, simply reply with a
4343-
// heartbeat response.
4344-
//
4345-
// NOTE: the client doesn't check on the broker, therefore it will never
4346-
// send 'HEARTBEAT_REQ' and hence we don't have to handle
4347-
// 'HEARTBEAT_RSP' type.
4348-
if (d_channel_sp) {
4349-
d_channel_sp->write(0, // status
4350-
bmqp::ProtocolUtil::heartbeatRspBlob(),
4351-
d_sessionOptions.channelHighWatermark());
4352-
// We explicitly ignore any failure as failure implies issues with the
4353-
// channel, which is what the heartbeat is trying to expose.
4354-
}
4355-
}
4356-
43574323
void BrokerSession::enqueueStateRestoredIfNeeded()
43584324
{
43594325
// executed by the FSM thread
@@ -5851,25 +5817,29 @@ BrokerSession::processPacket(const bdlbb::Blob& packet)
58515817
// executed by the *IO* thread
58525818
// or *APPLICATION* thread
58535819

5854-
enum { e_NUM_BYTES_IN_BLOB_TO_DUMP = 256 };
5855-
58565820
// Create a raw event with a cloned blob
58575821
bmqp::Event event(&packet, d_allocator_p, true);
5822+
5823+
return processPacket(event);
5824+
}
5825+
5826+
bmqt::GenericResult::Enum
5827+
BrokerSession::processPacket(const bmqp::Event& event)
5828+
{
5829+
// executed by the *IO* thread
5830+
// or *APPLICATION* thread
5831+
5832+
enum { e_NUM_BYTES_IN_BLOB_TO_DUMP = 256 };
5833+
58585834
if (BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(!event.isValid())) {
58595835
BSLS_PERFORMANCEHINT_UNLIKELY_HINT;
58605836
BALL_LOG_ERROR << "Received an invalid packet: "
58615837
<< bmqu::BlobStartHexDumper(
5862-
&packet,
5838+
event.blob(),
58635839
e_NUM_BYTES_IN_BLOB_TO_DUMP);
58645840
return bmqt::GenericResult::e_INVALID_ARGUMENT; // RETURN
58655841
}
58665842

5867-
if (BSLS_PERFORMANCEHINT_PREDICT_UNLIKELY(event.isHeartbeatReqEvent())) {
5868-
BSLS_PERFORMANCEHINT_UNLIKELY_HINT;
5869-
onHeartbeat();
5870-
return bmqt::GenericResult::e_SUCCESS; // RETURN
5871-
}
5872-
58735843
// Add to event queue
58745844
bsl::shared_ptr<Event> queueEvent = createEvent();
58755845
queueEvent->configureAsRawEvent(event);

src/groups/bmq/bmqimp/bmqimp_brokersession.h

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -967,10 +967,6 @@ class BrokerSession BSLS_CPP11_FINAL {
967967
/// the broker) is available on the channel.
968968
void processControlEvent(const bmqp::Event& event);
969969

970-
/// This method gets called each time a new heart beat event (sent by
971-
/// the broker) is available on the channel.
972-
void onHeartbeat();
973-
974970
void enableMessageRetransmission(const bmqp::PutMessageIterator& putIter,
975971
const bsls::TimeInterval& sentTime);
976972

@@ -1360,10 +1356,6 @@ class BrokerSession BSLS_CPP11_FINAL {
13601356
void doHandleChannelWatermark(bmqio::ChannelWatermarkType::Enum type,
13611357
const bsl::shared_ptr<Event>& eventSp);
13621358

1363-
/// Invoked from the FSM thread as a handler to the heartbeat event
1364-
/// specified as `eventSp` sent by the IO thread.
1365-
void doHandleHeartbeat(const bsl::shared_ptr<Event>& eventSp);
1366-
13671359
/// Invoked from the FSM thread to start channel closing.
13681360
void disconnectChannel();
13691361

@@ -1525,6 +1517,7 @@ class BrokerSession BSLS_CPP11_FINAL {
15251517
/// Return error status in case the packet cannot be handled by any
15261518
/// reason.
15271519
bmqt::GenericResult::Enum processPacket(const bdlbb::Blob& packet);
1520+
bmqt::GenericResult::Enum processPacket(const bmqp::Event& event);
15281521

15291522
/// Set the specified `channel` to use for communication with the
15301523
/// bmqbrkr. If `channel` is non null, this is a newly-established

0 commit comments

Comments
 (0)