@@ -132,15 +132,15 @@ data_store_conduit::~data_store_conduit()
132132 m_profile->close ();
133133 }
134134 if (m_is_local_cache && m_mem_seg) {
135- int sanity = shm_unlink (m_seg_name. c_str () );
135+ int sanity = munmap ( reinterpret_cast < void *>(m_mem_seg), m_mem_seg_length );
136136 if (sanity != 0 ) {
137- std::cout << " \n WARNING: shm_unlink failed in "
138- " data_store_conduit::~data_store_conduit()\n " ;
137+ LBANN_WARNING (" munmap failed in for m_seg_name " , m_seg_name);
139138 }
140- sanity = munmap (reinterpret_cast <void *>(m_mem_seg), m_mem_seg_length);
141- if (sanity != 0 ) {
142- std::cout << " \n WARNING: munmap failed in "
143- " data_store_conduit::~data_store_conduit()\n " ;
139+ if (m_comm->get_rank_in_node () == 0 ) {
140+ sanity = shm_unlink (m_seg_name.c_str ());
141+ if (sanity != 0 ) {
142+ LBANN_WARNING (" shm_unlink failed for m_seg_name " , m_seg_name);
143+ }
144144 }
145145 }
146146}
@@ -280,7 +280,7 @@ void data_store_conduit::spill_preloaded_conduit_node(int data_id,
280280 // note: at this point m_data[data_id] = node
281281 conduit::Node n3 = node;
282282 {
283- std::lock_guard<std::mutex> lock (m_mutex);
283+ // BVE 9-20-2023 std::lock_guard<std::mutex> lock(m_mutex);
284284 build_node_for_sending (node, n3);
285285 }
286286 if (!m_node_sizes_vary) {
@@ -329,13 +329,13 @@ void data_store_conduit::set_preloaded_conduit_node(int data_id,
329329 conduit::Node n2 = node; // node == m_data[data_id]
330330 std::lock_guard<std::mutex> lock (m_mutex);
331331 build_node_for_sending (n2, m_data[data_id]);
332- }
333- if (!m_node_sizes_vary) {
334- error_check_compacted_node (m_data[data_id], data_id);
335- }
336- else {
337- std::lock_guard<std::mutex> lock (m_mutex );
338- m_sample_sizes[data_id] = m_data[data_id]. total_bytes_compact ();
332+
333+ if (!m_node_sizes_vary) {
334+ error_check_compacted_node (m_data[data_id], data_id);
335+ }
336+ else {
337+ m_sample_sizes[data_id] = m_data[data_id]. total_bytes_compact ( );
338+ }
339339 }
340340}
341341
@@ -476,6 +476,7 @@ const conduit::Node& data_store_conduit::get_conduit_node(int data_id) const
476476{
477477 using iterator_t = std::unordered_map<int , conduit::Node>::const_iterator;
478478 if (is_local_cache ()) {
479+ std::lock_guard<std::mutex> lock (m_mutex);
479480 iterator_t t3 = m_data.find (data_id);
480481 if (t3 == m_data.end ()) {
481482 LBANN_ERROR (" (local cache) failed to find data_id: " ,
@@ -490,6 +491,7 @@ const conduit::Node& data_store_conduit::get_conduit_node(int data_id) const
490491 // if not preloaded, and get_label() or get_response() is called,
491492 // we need to check m_data
492493 if (t2 == m_minibatch_data.end ()) {
494+ std::lock_guard<std::mutex> lock (m_mutex);
493495 iterator_t t3 = m_data.find (data_id);
494496 if (t3 != m_data.end ()) {
495497 return t3->second [" data" ];
@@ -593,113 +595,119 @@ void data_store_conduit::start_exchange_data_by_sample(size_t current_pos,
593595 // part 2: exchange the actual data
594596
595597 // start sends for outgoing data
596- size_t ss = 0 ;
597- for (int p = 0 ; p < m_np_in_trainer; p++) {
598- const std::unordered_set<int >& indices = m_indices_to_send[p];
599- for (auto index : indices) {
600- if (m_data.find (index) == m_data.end ()) {
601- LBANN_ERROR (" failed to find data_id: " ,
602- index,
603- " to be sent to " ,
604- p,
605- " in m_data" );
606- }
607- const conduit::Node& n = m_data[index];
608- const El::byte* s = reinterpret_cast <const El::byte*>(n.data_ptr ());
609- if (!n.is_contiguous ()) {
610- LBANN_ERROR (" data_id: " , index, " does not have a contiguous layout" );
611- }
612- if (n.data_ptr () == nullptr ) {
613- LBANN_ERROR (" data_id: " , index, " does not have a valid data pointer" );
614- }
615- if (n.contiguous_data_ptr () == nullptr ) {
616- LBANN_ERROR (" data_id: " ,
617- index,
618- " does not have a valid contiguous data pointer" );
619- }
598+ {
599+ std::lock_guard<std::mutex> lock (m_mutex);
600+ size_t ss = 0 ;
601+ for (int p = 0 ; p < m_np_in_trainer; p++) {
602+ const std::unordered_set<int >& indices = m_indices_to_send[p];
603+ for (auto index : indices) {
604+ if (m_data.find (index) == m_data.end ()) {
605+ LBANN_ERROR (" failed to find data_id: " ,
606+ index,
607+ " to be sent to " ,
608+ p,
609+ " in m_data" );
610+ }
611+ const conduit::Node& n = m_data[index];
612+ const El::byte* s = reinterpret_cast <const El::byte*>(n.data_ptr ());
613+ if (!n.is_contiguous ()) {
614+ LBANN_ERROR (" data_id: " , index, " does not have a contiguous layout" );
615+ }
616+ if (n.data_ptr () == nullptr ) {
617+ LBANN_ERROR (" data_id: " ,
618+ index,
619+ " does not have a valid data pointer" );
620+ }
621+ if (n.contiguous_data_ptr () == nullptr ) {
622+ LBANN_ERROR (" data_id: " ,
623+ index,
624+ " does not have a valid contiguous data pointer" );
625+ }
620626
621- size_t sz = m_compacted_sample_size;
627+ size_t sz = m_compacted_sample_size;
622628
623- if (m_node_sizes_vary) {
624- if (m_sample_sizes.find (index) == m_sample_sizes.end ()) {
625- LBANN_ERROR (
626- " m_sample_sizes.find(index) == m_sample_sizes.end() for index: " ,
627- index,
628- " ; m_sample_sizes.size: " ,
629- m_sample_sizes.size ());
629+ if (m_node_sizes_vary) {
630+ if (m_sample_sizes.find (index) == m_sample_sizes.end ()) {
631+ LBANN_ERROR (
632+ " m_sample_sizes.find(index) == m_sample_sizes.end() for index: " ,
633+ index,
634+ " ; m_sample_sizes.size: " ,
635+ m_sample_sizes.size ());
636+ }
637+ sz = m_sample_sizes[index];
630638 }
631- sz = m_sample_sizes[index];
632- }
633639
634- m_comm->nb_tagged_send <El::byte>(s,
635- sz,
636- p,
637- index,
638- m_send_requests[ss++],
639- m_comm->get_trainer_comm ());
640+ m_comm->nb_tagged_send <El::byte>(s,
641+ sz,
642+ p,
643+ index,
644+ m_send_requests[ss++],
645+ m_comm->get_trainer_comm ());
646+ }
640647 }
641- }
642-
643- // sanity checks
644- if (ss != m_send_requests.size ()) {
645- LBANN_ERROR (" ss != m_send_requests.size; ss: " ,
646- ss,
647- " m_send_requests.size: " ,
648- m_send_requests.size ());
649- }
650-
651- // start recvs for incoming data
652- ss = 0 ;
653648
654- for (int p = 0 ; p < m_np_in_trainer; p++) {
655- const std::unordered_set<int >& indices = m_indices_to_recv[p];
656- int sanity = 0 ;
657- for (auto index : indices) {
658- ++sanity;
659- int sz = m_compacted_sample_size;
660- if (m_node_sizes_vary) {
661- if (m_sample_sizes.find (index) == m_sample_sizes.end ()) {
662- LBANN_ERROR (
663- " m_sample_sizes.find(index) == m_sample_sizes.end() for index: " ,
664- index,
665- " ; m_sample_sizes.size(): " ,
666- m_sample_sizes.size (),
667- " role: " ,
668- m_reader->get_role (),
669- " for index: " ,
670- sanity,
671- " of " ,
672- indices.size ());
649+ // sanity checks
650+ if (ss != m_send_requests.size ()) {
651+ LBANN_ERROR (" ss != m_send_requests.size; ss: " ,
652+ ss,
653+ " m_send_requests.size: " ,
654+ m_send_requests.size ());
655+ }
656+
657+ // start recvs for incoming data
658+ ss = 0 ;
659+
660+ for (int p = 0 ; p < m_np_in_trainer; p++) {
661+ const std::unordered_set<int >& indices = m_indices_to_recv[p];
662+ int sanity = 0 ;
663+ for (auto index : indices) {
664+ ++sanity;
665+ int sz = m_compacted_sample_size;
666+ if (m_node_sizes_vary) {
667+ if (m_sample_sizes.find (index) == m_sample_sizes.end ()) {
668+ LBANN_ERROR (
669+ " m_sample_sizes.find(index) == m_sample_sizes.end() for index: " ,
670+ index,
671+ " ; m_sample_sizes.size(): " ,
672+ m_sample_sizes.size (),
673+ " role: " ,
674+ m_reader->get_role (),
675+ " for index: " ,
676+ sanity,
677+ " of " ,
678+ indices.size ());
679+ }
680+ sz = m_sample_sizes[index];
673681 }
674- sz = m_sample_sizes[index];
675- }
676682
677- m_recv_buffer[ss].set (conduit::DataType::uint8 (sz));
678- El::byte* r = reinterpret_cast <El::byte*>(m_recv_buffer[ss].data_ptr ());
679- m_comm->nb_tagged_recv <El::byte>(r,
680- sz,
681- p,
682- index,
683- m_recv_requests[ss],
684- m_comm->get_trainer_comm ());
685- m_recv_data_ids[ss] = index;
686- ++ss;
683+ m_recv_buffer[ss].set (conduit::DataType::uint8 (sz));
684+ El::byte* r = reinterpret_cast <El::byte*>(m_recv_buffer[ss].data_ptr ());
685+ m_comm->nb_tagged_recv <El::byte>(r,
686+ sz,
687+ p,
688+ index,
689+ m_recv_requests[ss],
690+ m_comm->get_trainer_comm ());
691+ m_recv_data_ids[ss] = index;
692+ ++ss;
693+ }
687694 }
688- }
689695
690- // sanity checks
691- if (ss != m_recv_buffer.size ()) {
692- LBANN_ERROR (" ss != m_recv_buffer.size; ss: " ,
693- ss,
694- " m_recv_buffer.size: " ,
695- m_recv_buffer.size ());
696- }
697- if (m_recv_requests.size () != m_recv_buffer.size ()) {
698- LBANN_ERROR (" m_recv_requests.size != m_recv_buffer.size; m_recv_requests: " ,
699- m_recv_requests.size (),
700- " m_recv_buffer.size: " ,
701- m_recv_buffer.size ());
702- }
696+ // sanity checks
697+ if (ss != m_recv_buffer.size ()) {
698+ LBANN_ERROR (" ss != m_recv_buffer.size; ss: " ,
699+ ss,
700+ " m_recv_buffer.size: " ,
701+ m_recv_buffer.size ());
702+ }
703+ if (m_recv_requests.size () != m_recv_buffer.size ()) {
704+ LBANN_ERROR (
705+ " m_recv_requests.size != m_recv_buffer.size; m_recv_requests: " ,
706+ m_recv_requests.size (),
707+ " m_recv_buffer.size: " ,
708+ m_recv_buffer.size ());
709+ }
710+ } // End scope std::lock_guard<std::mutex> lock(m_mutex);
703711
704712 m_start_snd_rcv_time += (get_time () - tm5);
705713}
@@ -737,6 +745,7 @@ void data_store_conduit::finish_exchange_data_by_sample()
737745 m_rebuild_time += (get_time () - tm5);
738746
739747 if (m_spill) {
748+ std::lock_guard<std::mutex> lock (m_mutex);
740749 // TODO
741750 m_data.clear ();
742751 }
@@ -839,6 +848,7 @@ void data_store_conduit::build_preloaded_owner_map(
839848
840849const conduit::Node& data_store_conduit::get_random_node () const
841850{
851+ std::lock_guard<std::mutex> lock (m_mutex);
842852 size_t sz = m_data.size ();
843853
844854 // Deal with edge case
@@ -871,6 +881,7 @@ conduit::Node& data_store_conduit::get_empty_node(int data_id)
871881
872882void data_store_conduit::compact_nodes ()
873883{
884+ std::lock_guard<std::mutex> lock (m_mutex);
874885 for (auto && j : *m_shuffled_indices) {
875886 if (m_data.find (j) != m_data.end ()) {
876887 if (!(m_data[j].is_contiguous () && m_data[j].is_compact ())) {
@@ -1068,6 +1079,7 @@ void data_store_conduit::check_mem_capacity(lbann_comm* comm,
10681079
10691080bool data_store_conduit::has_conduit_node (int data_id) const
10701081{
1082+ std::lock_guard<std::mutex> lock (m_mutex);
10711083 std::unordered_map<int , conduit::Node>::const_iterator t =
10721084 m_data.find (data_id);
10731085 return t != m_data.end ();
@@ -1185,6 +1197,7 @@ void data_store_conduit::get_image_sizes(map_is_t& file_sizes,
11851197 // this block fires if we're exchanging cache data at the end
11861198 // of the first epoch, and the data store was not preloaded
11871199 if (is_explicitly_loading ()) {
1200+ std::lock_guard<std::mutex> lock (m_mutex);
11881201 for (const auto & t : m_data) {
11891202 int data_id = t.first ;
11901203 my_image_sizes.push_back (data_id);
@@ -1256,6 +1269,7 @@ void data_store_conduit::compute_image_offsets(
12561269 map_is_t & sizes,
12571270 std::vector<std::vector<int >>& indices)
12581271{
1272+ std::lock_guard<std::mutex> lock (m_mutex);
12591273 size_t offset = 0 ;
12601274 for (size_t p = 0 ; p < indices.size (); p++) {
12611275 for (auto idx : indices[p]) {
@@ -1319,7 +1333,7 @@ void data_store_conduit::allocate_shared_segment(
13191333 if (node_id == 0 ) {
13201334 shm_fd = shm_open (m_seg_name.c_str (), O_CREAT | O_RDWR | O_EXCL, 0666 );
13211335 if (shm_fd == -1 ) {
1322- LBANN_ERROR (" shm_open failed" );
1336+ LBANN_ERROR (" shm_open failed: " , m_seg_name );
13231337 }
13241338 int v = ftruncate (shm_fd, size);
13251339 if (v != 0 ) {
@@ -1382,6 +1396,7 @@ void data_store_conduit::exchange_local_caches()
13821396 PROFILE (" get_image_sizes time: " , (get_time () - tm1));
13831397
13841398 tm1 = get_time ();
1399+ // BVE 9-21-2023 It seems like this should only be done once
13851400 allocate_shared_segment (m_sample_sizes, indices);
13861401 PROFILE (" allocate_shared_segment time: " , (get_time () - tm1));
13871402
@@ -1447,6 +1462,7 @@ void data_store_conduit::read_files(std::vector<char>& work,
14471462void data_store_conduit::build_conduit_nodes (map_is_t & sizes)
14481463{
14491464 image_data_reader* image_reader = dynamic_cast <image_data_reader*>(m_reader);
1465+ std::lock_guard<std::mutex> lock (m_mutex);
14501466 for (auto t : sizes) {
14511467 int data_id = t.first ;
14521468 const auto sample = image_reader->get_sample (static_cast <size_t >(data_id));
0 commit comments