Skip to content

Commit 9a40b50

Browse files
juanlofer-eprosimamergify[bot]
authored andcommitted
Solve Discovery Server race conditions (#5780)
* Refs #23088: Test reconnection when removing participant Signed-off-by: cferreiragonz <[email protected]> * Refs #23088: Solve EDP-PDP queues race condition Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Solve data UP + data P race condition Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Abort writer/reader processing if associated participant not alive Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Apply suggestions Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Release change when writer/reader insertion in DB failed Signed-off-by: Juan Lopez Fernandez <[email protected]> * Refs #23088: Match servers after change update Signed-off-by: Juan Lopez Fernandez <[email protected]> --------- Signed-off-by: cferreiragonz <[email protected]> Signed-off-by: Juan Lopez Fernandez <[email protected]> Co-authored-by: cferreiragonz <[email protected]> (cherry picked from commit ec666f7) # Conflicts: # test/blackbox/common/BlackboxTestsDiscovery.cpp
1 parent c6f89d3 commit 9a40b50

File tree

4 files changed

+603
-40
lines changed

4 files changed

+603
-40
lines changed

src/cpp/rtps/builtin/discovery/database/DiscoveryDataBase.cpp

Lines changed: 105 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -492,9 +492,6 @@ void DiscoveryDataBase::process_pdp_data_queue()
492492
// Lock(exclusive mode) mutex locally
493493
std::lock_guard<std::recursive_mutex> guard(mutex_);
494494

495-
// Swap DATA queues
496-
pdp_data_queue_.Swap();
497-
498495
// Process all messages in the queque
499496
while (!pdp_data_queue_.Empty())
500497
{
@@ -532,9 +529,6 @@ bool DiscoveryDataBase::process_edp_data_queue()
532529
// Lock(exclusive mode) mutex locally
533530
std::lock_guard<std::recursive_mutex> guard(mutex_);
534531

535-
// Swap DATA queues
536-
edp_data_queue_.Swap();
537-
538532
eprosima::fastdds::rtps::CacheChange_t* change;
539533
std::string topic_name;
540534

@@ -658,7 +652,7 @@ void DiscoveryDataBase::match_new_server_(
658652
}
659653
}
660654
// The resources needed for TCP new connections are created during the matching process when the
661-
// DATA(p) is receieved by each server.
655+
// DATA(p) is received by each server.
662656

663657
// Create virtual endpoints
664658
create_virtual_endpoints_(participant_prefix);
@@ -786,32 +780,71 @@ void DiscoveryDataBase::update_participant_from_change_(
786780
{
787781
fastdds::rtps::GUID_t change_guid = guid_from_change(ch);
788782

783+
assert(ch->kind == eprosima::fastdds::rtps::ALIVE);
784+
785+
// If the change corresponds to a previously removed participant (which hasn't yet been removed from the map since
786+
// the DATA(Up) is still unacked), update map with new data and behave as if it was a new participant.
787+
// Remove also the old change from the disposals collection, if it was added just before
788+
if (participant_info.change()->kind != eprosima::fastdds::rtps::ALIVE)
789+
{
790+
// Update the change data
791+
participant_info.participant_change_data(change_data);
792+
793+
// Remove old change from disposals if it was added just before to avoid sending data UP
794+
auto it = std::find(disposals_.begin(), disposals_.end(), participant_info.change());
795+
if (it != disposals_.end())
796+
{
797+
disposals_.erase(it);
798+
}
799+
800+
// Update change. This should add the UNALIVE change to changes_to_release_, which should later both remove the
801+
// change from the writer's history and release the change
802+
update_change_and_unmatch_(ch, participant_info);
803+
804+
// If it is local and server we have to create virtual endpoints, except for our own server
805+
if (change_guid.guidPrefix != server_guid_prefix_ && !change_data.is_client() && change_data.is_local())
806+
{
807+
// Match new server and create virtual endpoints
808+
// NOTE: match after having updated the change, so virtual endpoints are not discarded for having
809+
// an associated unalive participant
810+
match_new_server_(change_guid.guidPrefix, change_data.is_superclient());
811+
}
812+
813+
// Treat as a new participant found
814+
new_updates_++;
815+
if (change_guid.guidPrefix != server_guid_prefix_)
816+
{
817+
server_acked_by_all(false);
818+
}
819+
}
820+
789821
// Specific case when a Data(P) from an entity A known as remote comes from the very entity A (we have
790822
// the Data(P) because of other server B, but now it arrives from A itself)
791823
// The entity A changes to local
792824
// Must be local data, or else it is a remote endpoint and should not be changed
793-
if (change_guid.guidPrefix != server_guid_prefix_ && change_data.is_local() &&
825+
else if (change_guid.guidPrefix != server_guid_prefix_ && change_data.is_local() &&
794826
DiscoveryDataBase::participant_data_has_changed_(participant_info, change_data))
795827
{
828+
// Update the change data
829+
participant_info.participant_change_data(change_data);
830+
831+
// Update change
832+
update_change_and_unmatch_(ch, participant_info);
833+
796834
// If the participant changes to server local, virtual endpoints must be added
797835
// If it is local and server the only possibility is it was a remote server and it must be converted to local
798836
if (!change_data.is_client())
799837
{
838+
// NOTE: match after having updated the change in order to send the new Data(P)
800839
match_new_server_(change_guid.guidPrefix, change_data.is_superclient());
801840
}
802841

803-
// Update the change data
804-
participant_info.participant_change_data(change_data);
805-
806-
// Update change
807-
update_change_and_unmatch_(ch, participant_info);
808-
809842
// Treat as a new participant found
810843
new_updates_++;
811844
server_acked_by_all(false);
812845

813846
// It is possible that this Data(P) is in our history if it has not been acked by all
814-
// In this case we have to resent it with the new update
847+
// In this case we have to resend it with the new update
815848
if (!participant_info.is_acked_by_all())
816849
{
817850
add_pdp_to_send_(ch);
@@ -914,6 +947,29 @@ void DiscoveryDataBase::create_writers_from_change_(
914947
// The writer was NOT known by the database
915948
else
916949
{
950+
// Check if corresponding participant is known, abort otherwise
951+
// NOTE: Processing a DATA(w) should always be preceded by the reception and processing of its corresponding
952+
// participant. However, one may receive a DATA(w) just after the participant has been removed, case in which the
953+
// former should no longer be processed.
954+
std::map<eprosima::fastdds::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator writer_part_it =
955+
participants_.find(writer_guid.guidPrefix);
956+
if (writer_part_it == participants_.end())
957+
{
958+
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
959+
"Writer " << writer_guid << " has no associated participant. Skipping");
960+
assert(topic_name != virtual_topic_);
961+
changes_to_release_.push_back(ch); // Release change so it can be reused
962+
return;
963+
}
964+
else if (writer_part_it->second.change()->kind != fastdds::rtps::ChangeKind_t::ALIVE)
965+
{
966+
EPROSIMA_LOG_WARNING(DISCOVERY_DATABASE,
967+
"Writer " << writer_guid << " is associated to a removed participant. Skipping");
968+
assert(topic_name != virtual_topic_);
969+
changes_to_release_.push_back(ch); // Release change so it can be reused
970+
return;
971+
}
972+
917973
// Add entry to writers_
918974
DiscoveryEndpointInfo tmp_writer(
919975
ch,
@@ -934,18 +990,7 @@ void DiscoveryDataBase::create_writers_from_change_(
934990
new_updates_++;
935991

936992
// Add entry to participants_[guid_prefix]::writers
937-
std::map<eprosima::fastdds::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator writer_part_it =
938-
participants_.find(writer_guid.guidPrefix);
939-
if (writer_part_it != participants_.end())
940-
{
941-
writer_part_it->second.add_writer(writer_guid);
942-
}
943-
else
944-
{
945-
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
946-
"Writer " << writer_guid << " has no associated participant. Skipping");
947-
return;
948-
}
993+
writer_part_it->second.add_writer(writer_guid);
949994

950995
// Add writer to writers_by_topic_[topic_name]
951996
add_writer_to_topic_(writer_guid, topic_name);
@@ -1032,6 +1077,29 @@ void DiscoveryDataBase::create_readers_from_change_(
10321077
// The reader was NOT known by the database
10331078
else
10341079
{
1080+
// Check if corresponding participant is known, abort otherwise
1081+
// NOTE: Processing a DATA(r) should always be preceded by the reception and processing of its corresponding
1082+
// participant. However, one may receive a DATA(r) just after the participant has been removed, case in which the
1083+
// former should no longer be processed.
1084+
std::map<eprosima::fastdds::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator reader_part_it =
1085+
participants_.find(reader_guid.guidPrefix);
1086+
if (reader_part_it == participants_.end())
1087+
{
1088+
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
1089+
"Reader " << reader_guid << " has no associated participant. Skipping");
1090+
assert(topic_name != virtual_topic_);
1091+
changes_to_release_.push_back(ch); // Release change so it can be reused
1092+
return;
1093+
}
1094+
else if (reader_part_it->second.change()->kind != fastdds::rtps::ChangeKind_t::ALIVE)
1095+
{
1096+
EPROSIMA_LOG_WARNING(DISCOVERY_DATABASE,
1097+
"Reader " << reader_guid << " is associated to a removed participant. Skipping");
1098+
assert(topic_name != virtual_topic_);
1099+
changes_to_release_.push_back(ch); // Release change so it can be reused
1100+
return;
1101+
}
1102+
10351103
// Add entry to readers_
10361104
DiscoveryEndpointInfo tmp_reader(
10371105
ch,
@@ -1052,18 +1120,7 @@ void DiscoveryDataBase::create_readers_from_change_(
10521120
new_updates_++;
10531121

10541122
// Add entry to participants_[guid_prefix]::readers
1055-
std::map<eprosima::fastdds::rtps::GuidPrefix_t, DiscoveryParticipantInfo>::iterator reader_part_it =
1056-
participants_.find(reader_guid.guidPrefix);
1057-
if (reader_part_it != participants_.end())
1058-
{
1059-
reader_part_it->second.add_reader(reader_guid);
1060-
}
1061-
else
1062-
{
1063-
EPROSIMA_LOG_ERROR(DISCOVERY_DATABASE,
1064-
"Reader " << reader_guid << " has no associated participant. Skipping");
1065-
return;
1066-
}
1123+
reader_part_it->second.add_reader(reader_guid);
10671124

10681125
// Add reader to readers_by_topic_[topic_name]
10691126
add_reader_to_topic_(reader_guid, topic_name);
@@ -1345,7 +1402,7 @@ void DiscoveryDataBase::process_dispose_participant_(
13451402
delete_reader_entity_(reader_guid);
13461403
}
13471404

1348-
// All participant endoints must be already unmatched in others endopoints relevant_ack maps
1405+
// All participant endpoints must be already unmatched in others endpoints relevant_ack maps
13491406

13501407
// Unmatch own participant
13511408
unmatch_participant_(participant_guid.guidPrefix);
@@ -1602,6 +1659,14 @@ bool DiscoveryDataBase::data_queue_empty()
16021659
return (pdp_data_queue_.BothEmpty() && edp_data_queue_.BothEmpty());
16031660
}
16041661

1662+
void DiscoveryDataBase::swap_data_queues()
1663+
{
1664+
// Swap EDP before PDP to avoid race condition in which both data P and w/r are received at the same time,
1665+
// just after having swapped the PDP queue
1666+
edp_data_queue_.Swap();
1667+
pdp_data_queue_.Swap();
1668+
}
1669+
16051670
bool DiscoveryDataBase::is_participant(
16061671
const eprosima::fastdds::rtps::GUID_t& guid)
16071672
{

src/cpp/rtps/builtin/discovery/database/DiscoveryDataBase.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,9 @@ class DiscoveryDataBase
294294
// Check if the data queue is empty
295295
bool data_queue_empty();
296296

297+
// Swap both EDP and PDP data queues
298+
void swap_data_queues();
299+
297300
void to_json(
298301
nlohmann::json& j) const;
299302

src/cpp/rtps/builtin/discovery/participant/PDPServer.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1080,6 +1080,9 @@ bool PDPServer::remove_remote_participant(
10801080
bool PDPServer::process_data_queues()
10811081
{
10821082
EPROSIMA_LOG_INFO(RTPS_PDP_SERVER, "process_data_queues start");
1083+
// Swap both as a first step in order to avoid the following race condition: reception of data w/r while processing
1084+
// the PDP queue, not having processed yet the corresponding data P (also received while processing the queue).
1085+
discovery_db_.swap_data_queues();
10831086
discovery_db_.process_pdp_data_queue();
10841087
return discovery_db_.process_edp_data_queue();
10851088
}

0 commit comments

Comments
 (0)