
Commit 73ef72b
Fixed multiple bugs in data store (#2325)
* Fixed a bug in the data store where the call to get_empty_node was not using a proper mutex for the m_data structure. Added additional instances of locking to avoid thread race conditions. Fixed the shared memory segment handling so that only rank 0 on a node will try to unlink it.
* Fixed the order of memory unmapping and unlinking.
* Applied clang-format.
* Fixed the locking within the exchange_local_caches function, using finer-grained locks to avoid nesting locks.
1 parent 572f119 commit 73ef72b
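
The common thread in these fixes is a single pattern, sketched below in self-contained form (the class and members are illustrative stand-ins, not LBANN's actual data_store_conduit API): the mutex guarding the shared containers is declared mutable so that const readers can lock it, and every access to the guarded state, reads included, goes through a std::lock_guard, because a concurrent insert into a std::unordered_map can rehash it out from under an unlocked find.

```cpp
#include <mutex>
#include <unordered_map>

// Illustrative sketch of the locking pattern this commit applies; "Store"
// and its members are stand-ins, not LBANN's data_store_conduit API.
class Store
{
public:
  // Readers must lock too: a concurrent insert can rehash the map.
  bool has(int id) const
  {
    std::lock_guard<std::mutex> lock(m_mutex); // legal in const: mutex is mutable
    return m_data.find(id) != m_data.end();
  }

  void set(int id, int value)
  {
    std::lock_guard<std::mutex> lock(m_mutex);
    m_data[id] = value;
  }

private:
  mutable std::mutex m_mutex; // mutable so const member functions can lock it
  std::unordered_map<int, int> m_data;
};
```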

2 files changed (+137, -115 lines)

include/lbann/data_store/data_store_conduit.hpp

Lines changed: 8 additions & 2 deletions
@@ -261,7 +261,11 @@ class data_store_conduit
   std::ofstream* m_profile = nullptr;

   /// for use during development and debugging
-  int get_data_size() { return m_data.size(); }
+  int get_data_size()
+  {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return m_data.size();
+  }

   /// made public for debugging during development
   void copy_members(const data_store_conduit& rhs);
@@ -362,7 +366,9 @@ class data_store_conduit
   map_ii_t m_spilled_nodes;

   /// used in set_conduit_node(...)
-  std::mutex m_mutex;
+  // Guards m_data, m_sample_sizes, and m_image_offsets
+  mutable std::mutex m_mutex;
+  // Guards m_compact_sample_size
   std::mutex m_mutex_2;

   /// for use in local cache mode

src/data_store/data_store_conduit.cpp

Lines changed: 129 additions & 113 deletions
@@ -132,15 +132,15 @@ data_store_conduit::~data_store_conduit()
     m_profile->close();
   }
   if (m_is_local_cache && m_mem_seg) {
-    int sanity = shm_unlink(m_seg_name.c_str());
+    int sanity = munmap(reinterpret_cast<void*>(m_mem_seg), m_mem_seg_length);
     if (sanity != 0) {
-      std::cout << "\nWARNING: shm_unlink failed in "
-                   "data_store_conduit::~data_store_conduit()\n";
+      LBANN_WARNING("munmap failed for m_seg_name ", m_seg_name);
     }
-    sanity = munmap(reinterpret_cast<void*>(m_mem_seg), m_mem_seg_length);
-    if (sanity != 0) {
-      std::cout << "\nWARNING: munmap failed in "
-                   "data_store_conduit::~data_store_conduit()\n";
+    if (m_comm->get_rank_in_node() == 0) {
+      sanity = shm_unlink(m_seg_name.c_str());
+      if (sanity != 0) {
+        LBANN_WARNING("shm_unlink failed for m_seg_name ", m_seg_name);
+      }
     }
   }
 }
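
The reordering above follows POSIX shared-memory semantics: shm_unlink removes only the name, and the segment itself is freed once every process has unmapped it; but once the name is gone, a second rank's shm_unlink fails. Hence each rank unmaps its own view first, and exactly one rank per node removes the name. A minimal sketch of that teardown order (standalone illustration; rank_in_node is a hypothetical stand-in for m_comm->get_rank_in_node()):

```cpp
#include <cstddef>
#include <cstdio>
#include <sys/mman.h> // munmap, shm_unlink

// Sketch: tear down a per-node POSIX shared-memory segment.
void teardown_segment(void* seg, std::size_t len, const char* name,
                      int rank_in_node)
{
  // 1. Every rank unmaps its own view of the segment.
  if (munmap(seg, len) != 0) {
    std::perror("munmap");
  }
  // 2. Only one rank per node unlinks the name; if every rank tried,
  //    all but the first would fail with ENOENT.
  if (rank_in_node == 0 && shm_unlink(name) != 0) {
    std::perror("shm_unlink");
  }
}
```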
@@ -280,7 +280,7 @@ void data_store_conduit::spill_preloaded_conduit_node(int data_id,
   // note: at this point m_data[data_id] = node
   conduit::Node n3 = node;
   {
-    std::lock_guard<std::mutex> lock(m_mutex);
+    // BVE 9-20-2023 std::lock_guard<std::mutex> lock(m_mutex);
     build_node_for_sending(node, n3);
   }
   if (!m_node_sizes_vary) {
@@ -329,13 +329,13 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id,
     conduit::Node n2 = node; // node == m_data[data_id]
     std::lock_guard<std::mutex> lock(m_mutex);
     build_node_for_sending(n2, m_data[data_id]);
-  }
-  if (!m_node_sizes_vary) {
-    error_check_compacted_node(m_data[data_id], data_id);
-  }
-  else {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact();
+
+    if (!m_node_sizes_vary) {
+      error_check_compacted_node(m_data[data_id], data_id);
+    }
+    else {
+      m_sample_sizes[data_id] = m_data[data_id].total_bytes_compact();
+    }
   }
 }

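The restructuring above deserves a note: the old code released m_mutex at the closing brace, then touched m_data and m_sample_sizes again, partly unlocked and partly under a second lock, so another thread could interleave between the two critical sections. The fix moves the closing brace so a single lock_guard spans the whole build-check-record sequence. A minimal sketch of the hazard and the fix (illustrative names, not LBANN code):

```cpp
#include <mutex>
#include <unordered_map>

std::mutex mtx;                     // stand-in for m_mutex
std::unordered_map<int, int> table; // stand-in for m_data

// Racy: the lock is dropped between the write and the follow-up read,
// so another thread can modify (or rehash) the table in between.
void update_racy(int id, int v)
{
  {
    std::lock_guard<std::mutex> lock(mtx);
    table[id] = v;
  } // lock released here
  int check = table[id]; // unlocked read: data race
  (void)check;
}

// Fixed, as in the diff above: one lock spans the whole sequence.
void update_atomic(int id, int v)
{
  std::lock_guard<std::mutex> lock(mtx);
  table[id] = v;
  int check = table[id]; // still under the same lock
  (void)check;
}
```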
@@ -476,6 +476,7 @@ const conduit::Node& data_store_conduit::get_conduit_node(int data_id) const
 {
   using iterator_t = std::unordered_map<int, conduit::Node>::const_iterator;
   if (is_local_cache()) {
+    std::lock_guard<std::mutex> lock(m_mutex);
     iterator_t t3 = m_data.find(data_id);
     if (t3 == m_data.end()) {
       LBANN_ERROR("(local cache) failed to find data_id: ",
@@ -490,6 +491,7 @@ const conduit::Node& data_store_conduit::get_conduit_node(int data_id) const
   // if not preloaded, and get_label() or get_response() is called,
   // we need to check m_data
   if (t2 == m_minibatch_data.end()) {
+    std::lock_guard<std::mutex> lock(m_mutex);
     iterator_t t3 = m_data.find(data_id);
     if (t3 != m_data.end()) {
       return t3->second["data"];
@@ -593,113 +595,119 @@ void data_store_conduit::start_exchange_data_by_sample(size_t current_pos,
   // part 2: exchange the actual data

   // start sends for outgoing data
-  size_t ss = 0;
-  for (int p = 0; p < m_np_in_trainer; p++) {
-    const std::unordered_set<int>& indices = m_indices_to_send[p];
-    for (auto index : indices) {
-      if (m_data.find(index) == m_data.end()) {
-        LBANN_ERROR("failed to find data_id: ",
-                    index,
-                    " to be sent to ",
-                    p,
-                    " in m_data");
-      }
-      const conduit::Node& n = m_data[index];
-      const El::byte* s = reinterpret_cast<const El::byte*>(n.data_ptr());
-      if (!n.is_contiguous()) {
-        LBANN_ERROR("data_id: ", index, " does not have a contiguous layout");
-      }
-      if (n.data_ptr() == nullptr) {
-        LBANN_ERROR("data_id: ", index, " does not have a valid data pointer");
-      }
-      if (n.contiguous_data_ptr() == nullptr) {
-        LBANN_ERROR("data_id: ",
-                    index,
-                    " does not have a valid contiguous data pointer");
-      }
+  {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    size_t ss = 0;
+    for (int p = 0; p < m_np_in_trainer; p++) {
+      const std::unordered_set<int>& indices = m_indices_to_send[p];
+      for (auto index : indices) {
+        if (m_data.find(index) == m_data.end()) {
+          LBANN_ERROR("failed to find data_id: ",
+                      index,
+                      " to be sent to ",
+                      p,
+                      " in m_data");
+        }
+        const conduit::Node& n = m_data[index];
+        const El::byte* s = reinterpret_cast<const El::byte*>(n.data_ptr());
+        if (!n.is_contiguous()) {
+          LBANN_ERROR("data_id: ", index, " does not have a contiguous layout");
+        }
+        if (n.data_ptr() == nullptr) {
+          LBANN_ERROR("data_id: ",
+                      index,
+                      " does not have a valid data pointer");
+        }
+        if (n.contiguous_data_ptr() == nullptr) {
+          LBANN_ERROR("data_id: ",
+                      index,
+                      " does not have a valid contiguous data pointer");
+        }

-      size_t sz = m_compacted_sample_size;
+        size_t sz = m_compacted_sample_size;

-      if (m_node_sizes_vary) {
-        if (m_sample_sizes.find(index) == m_sample_sizes.end()) {
-          LBANN_ERROR(
-            "m_sample_sizes.find(index) == m_sample_sizes.end() for index: ",
-            index,
-            "; m_sample_sizes.size: ",
-            m_sample_sizes.size());
+        if (m_node_sizes_vary) {
+          if (m_sample_sizes.find(index) == m_sample_sizes.end()) {
+            LBANN_ERROR(
+              "m_sample_sizes.find(index) == m_sample_sizes.end() for index: ",
+              index,
+              "; m_sample_sizes.size: ",
+              m_sample_sizes.size());
+          }
+          sz = m_sample_sizes[index];
         }
-        sz = m_sample_sizes[index];
-      }

-      m_comm->nb_tagged_send<El::byte>(s,
-                                       sz,
-                                       p,
-                                       index,
-                                       m_send_requests[ss++],
-                                       m_comm->get_trainer_comm());
+        m_comm->nb_tagged_send<El::byte>(s,
+                                         sz,
+                                         p,
+                                         index,
+                                         m_send_requests[ss++],
+                                         m_comm->get_trainer_comm());
+      }
     }
-  }
-
-  // sanity checks
-  if (ss != m_send_requests.size()) {
-    LBANN_ERROR("ss != m_send_requests.size; ss: ",
-                ss,
-                " m_send_requests.size: ",
-                m_send_requests.size());
-  }
-
-  // start recvs for incoming data
-  ss = 0;

-  for (int p = 0; p < m_np_in_trainer; p++) {
-    const std::unordered_set<int>& indices = m_indices_to_recv[p];
-    int sanity = 0;
-    for (auto index : indices) {
-      ++sanity;
-      int sz = m_compacted_sample_size;
-      if (m_node_sizes_vary) {
-        if (m_sample_sizes.find(index) == m_sample_sizes.end()) {
-          LBANN_ERROR(
-            "m_sample_sizes.find(index) == m_sample_sizes.end() for index: ",
-            index,
-            "; m_sample_sizes.size(): ",
-            m_sample_sizes.size(),
-            " role: ",
-            m_reader->get_role(),
-            " for index: ",
-            sanity,
-            " of ",
-            indices.size());
+    // sanity checks
+    if (ss != m_send_requests.size()) {
+      LBANN_ERROR("ss != m_send_requests.size; ss: ",
+                  ss,
+                  " m_send_requests.size: ",
+                  m_send_requests.size());
+    }
+
+    // start recvs for incoming data
+    ss = 0;
+
+    for (int p = 0; p < m_np_in_trainer; p++) {
+      const std::unordered_set<int>& indices = m_indices_to_recv[p];
+      int sanity = 0;
+      for (auto index : indices) {
+        ++sanity;
+        int sz = m_compacted_sample_size;
+        if (m_node_sizes_vary) {
+          if (m_sample_sizes.find(index) == m_sample_sizes.end()) {
+            LBANN_ERROR(
+              "m_sample_sizes.find(index) == m_sample_sizes.end() for index: ",
+              index,
+              "; m_sample_sizes.size(): ",
+              m_sample_sizes.size(),
+              " role: ",
+              m_reader->get_role(),
+              " for index: ",
+              sanity,
+              " of ",
+              indices.size());
+          }
+          sz = m_sample_sizes[index];
         }
-        sz = m_sample_sizes[index];
-      }

-      m_recv_buffer[ss].set(conduit::DataType::uint8(sz));
-      El::byte* r = reinterpret_cast<El::byte*>(m_recv_buffer[ss].data_ptr());
-      m_comm->nb_tagged_recv<El::byte>(r,
-                                       sz,
-                                       p,
-                                       index,
-                                       m_recv_requests[ss],
-                                       m_comm->get_trainer_comm());
-      m_recv_data_ids[ss] = index;
-      ++ss;
+        m_recv_buffer[ss].set(conduit::DataType::uint8(sz));
+        El::byte* r = reinterpret_cast<El::byte*>(m_recv_buffer[ss].data_ptr());
+        m_comm->nb_tagged_recv<El::byte>(r,
+                                         sz,
+                                         p,
+                                         index,
+                                         m_recv_requests[ss],
+                                         m_comm->get_trainer_comm());
+        m_recv_data_ids[ss] = index;
+        ++ss;
+      }
     }
-  }

-  // sanity checks
-  if (ss != m_recv_buffer.size()) {
-    LBANN_ERROR("ss != m_recv_buffer.size; ss: ",
-                ss,
-                " m_recv_buffer.size: ",
-                m_recv_buffer.size());
-  }
-  if (m_recv_requests.size() != m_recv_buffer.size()) {
-    LBANN_ERROR("m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ",
-                m_recv_requests.size(),
-                " m_recv_buffer.size: ",
-                m_recv_buffer.size());
-  }
+    // sanity checks
+    if (ss != m_recv_buffer.size()) {
+      LBANN_ERROR("ss != m_recv_buffer.size; ss: ",
+                  ss,
+                  " m_recv_buffer.size: ",
+                  m_recv_buffer.size());
+    }
+    if (m_recv_requests.size() != m_recv_buffer.size()) {
+      LBANN_ERROR(
+        "m_recv_requests.size != m_recv_buffer.size; m_recv_requests: ",
+        m_recv_requests.size(),
+        " m_recv_buffer.size: ",
+        m_recv_buffer.size());
+    }
+  } // End scope std::lock_guard<std::mutex> lock(m_mutex);

   m_start_snd_rcv_time += (get_time() - tm5);
 }
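
Note the idiom that bounds this critical section: rather than raw m_mutex.lock()/unlock() calls around a hundred-odd lines, the whole send/recv phase sits in an extra brace scope, so the std::lock_guard releases the mutex at the closing brace on every exit path, including when LBANN_ERROR throws. In skeleton form (illustrative, not the LBANN code itself):

```cpp
#include <mutex>

std::mutex m_mutex; // stand-in for the data store's mutex

void start_exchange()
{
  // ... setup that needs no lock ...
  {
    std::lock_guard<std::mutex> lock(m_mutex); // acquired here
    // ... post sends, post recvs, run sanity checks ...
  } // end scope: lock released here, even if an error was thrown
  // ... timing and bookkeeping that need no lock ...
}
```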
@@ -737,6 +745,7 @@ void data_store_conduit::finish_exchange_data_by_sample()
   m_rebuild_time += (get_time() - tm5);

   if (m_spill) {
+    std::lock_guard<std::mutex> lock(m_mutex);
     // TODO
     m_data.clear();
   }
@@ -839,6 +848,7 @@ void data_store_conduit::build_preloaded_owner_map(

 const conduit::Node& data_store_conduit::get_random_node() const
 {
+  std::lock_guard<std::mutex> lock(m_mutex);
   size_t sz = m_data.size();

   // Deal with edge case
@@ -871,6 +881,7 @@ conduit::Node& data_store_conduit::get_empty_node(int data_id)

 void data_store_conduit::compact_nodes()
 {
+  std::lock_guard<std::mutex> lock(m_mutex);
   for (auto&& j : *m_shuffled_indices) {
     if (m_data.find(j) != m_data.end()) {
       if (!(m_data[j].is_contiguous() && m_data[j].is_compact())) {
@@ -1068,6 +1079,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm* comm,

 bool data_store_conduit::has_conduit_node(int data_id) const
 {
+  std::lock_guard<std::mutex> lock(m_mutex);
   std::unordered_map<int, conduit::Node>::const_iterator t =
     m_data.find(data_id);
   return t != m_data.end();
@@ -1185,6 +1197,7 @@ void data_store_conduit::get_image_sizes(map_is_t& file_sizes,
   // this block fires if we're exchanging cache data at the end
   // of the first epoch, and the data store was not preloaded
   if (is_explicitly_loading()) {
+    std::lock_guard<std::mutex> lock(m_mutex);
     for (const auto& t : m_data) {
       int data_id = t.first;
       my_image_sizes.push_back(data_id);
@@ -1256,6 +1269,7 @@ void data_store_conduit::compute_image_offsets(
   map_is_t& sizes,
   std::vector<std::vector<int>>& indices)
 {
+  std::lock_guard<std::mutex> lock(m_mutex);
   size_t offset = 0;
   for (size_t p = 0; p < indices.size(); p++) {
     for (auto idx : indices[p]) {
@@ -1319,7 +1333,7 @@ void data_store_conduit::allocate_shared_segment(
   if (node_id == 0) {
     shm_fd = shm_open(m_seg_name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666);
     if (shm_fd == -1) {
-      LBANN_ERROR("shm_open failed");
+      LBANN_ERROR("shm_open failed: ", m_seg_name);
    }
     int v = ftruncate(shm_fd, size);
     if (v != 0) {
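
A note on why the improved message includes m_seg_name: with O_CREAT | O_RDWR | O_EXCL, shm_open fails with EEXIST when a segment of that name already exists, for example one left behind in /dev/shm by a crashed run, and the name pinpoints the stale file to remove. A hedged sketch of reporting such a failure (standalone illustration, not LBANN code):

```cpp
#include <cerrno>
#include <cstring>
#include <fcntl.h>    // O_* flags
#include <stdexcept>
#include <string>
#include <sys/mman.h> // shm_open

// Sketch: create a named segment, or explain exactly what failed.
int open_segment_or_throw(const std::string& name)
{
  int fd = shm_open(name.c_str(), O_CREAT | O_RDWR | O_EXCL, 0666);
  if (fd == -1) {
    // EEXIST here usually means a stale /dev/shm entry from an earlier run.
    throw std::runtime_error("shm_open failed: " + name + " (" +
                             std::strerror(errno) + ")");
  }
  return fd;
}
```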
@@ -1382,6 +1396,7 @@ void data_store_conduit::exchange_local_caches()
   PROFILE("  get_image_sizes time: ", (get_time() - tm1));

   tm1 = get_time();
+  // BVE 9-21-2023 It seems like this should only be done once
   allocate_shared_segment(m_sample_sizes, indices);
   PROFILE("  allocate_shared_segment time: ", (get_time() - tm1));

@@ -1447,6 +1462,7 @@ void data_store_conduit::read_files(std::vector<char>& work,
 void data_store_conduit::build_conduit_nodes(map_is_t& sizes)
 {
   image_data_reader* image_reader = dynamic_cast<image_data_reader*>(m_reader);
+  std::lock_guard<std::mutex> lock(m_mutex);
   for (auto t : sizes) {
     int data_id = t.first;
     const auto sample = image_reader->get_sample(static_cast<size_t>(data_id));
