
Commit cefa43a

Merge pull request #467 from pdziekan/sharedmem_3d_y_blitz_storage_order_mpi_by_two_threads
Sharedmem 3d y blitz storage order mpi by two threads
2 parents 23a8eb1 + 3964132 commit cefa43a

20 files changed: +518 / -140 lines

libmpdata++/bcond/detail/bcond_common.hpp

Lines changed: 5 additions & 2 deletions
@@ -193,6 +193,8 @@ namespace libmpdataxx
     virtual void avg_edge_and_halo1_sclr_cyclic(arr_3d_t &, const rng_t &, const rng_t &)
     {};
 
+    const bool single_threaded;
+
     protected:
     // sclr
     int
@@ -207,7 +209,7 @@ namespace libmpdataxx
     public:
 
     // ctor
-    bcond_common(const rng_t &i, const std::array<int, n_dims> &) :
+    bcond_common(const rng_t &i, const std::array<int, n_dims> &, bool single_threaded = false) :
      // sclr
      left_edge_sclr(
        i.first()
@@ -247,7 +249,8 @@ namespace libmpdataxx
      rght_intr_vctr(
        (i^h^(-1)).last() - (halo - 1),
        (i^h^(-1)).last()
-     )
+     ),
+     single_threaded(single_threaded)
     {}
 
     // the one for use in shared
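
The diff above adds a trailing, defaulted single_threaded flag to the bcond_common constructor and stores it as a const member, so existing boundary conditions compile unchanged while boundary conditions serviced by a single thread per MPI process (see remote_3d_common below) can opt in. A minimal sketch of the pattern, with illustrative class names that are not part of libmpdata++:

#include <array>

struct bcond_base                            // stand-in for bcond_common
{
  const bool single_threaded;
  bcond_base(const std::array<int, 3> &, bool single_threaded = false) :
    single_threaded(single_threaded)
  {}
};

struct mpi_by_one_thread_bcond : bcond_base  // stand-in for remote_3d_common
{
  mpi_by_one_thread_bcond(const std::array<int, 3> &grid) :
    bcond_base(grid, true)                   // this bcond is handled by a single thread
  {}
};

int main()
{
  bcond_base b({{10, 20, 30}});              // existing call sites: defaults to false
  mpi_by_one_thread_bcond bc({{10, 20, 30}});
  return (!b.single_threaded && bc.single_threaded) ? 0 : 1;
}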

libmpdata++/bcond/detail/polar_common.hpp

Lines changed: 3 additions & 3 deletions
@@ -37,10 +37,10 @@ namespace libmpdataxx
       // ctor
       polar_common(
         const rng_t &i,
-        const std::array<int, n_dims> &grid_size
+        const std::array<int, n_dims> &distmem_grid_size
       ) :
-        parent_t(i, grid_size),
-        pole((grid_size[0] - 1) / 2)
+        parent_t(i, distmem_grid_size),
+        pole((distmem_grid_size[0] - 1) / 2)
       {}
     };
   } // namespace detail

Lines changed: 116 additions & 0 deletions (new file)
@@ -0,0 +1,116 @@
+// common code for ``remote'' MPI boundary conditions for libmpdata++
+//
+// licensing: GPU GPL v3
+// copyright: University of Warsaw
+
+#pragma once
+
+#include <libmpdata++/bcond/detail/remote_common.hpp>
+
+namespace libmpdataxx
+{
+  namespace bcond
+  {
+    namespace detail
+    {
+      template <typename real_t, int halo, drctn_e dir>
+      class remote_3d_common : public remote_common<real_t, halo, dir, 3>
+      {
+        using parent_t = detail::remote_common<real_t, halo, dir, 3>;
+
+        protected:
+
+        using arr_t = typename parent_t::arr_t;
+        using idx_t = typename parent_t::idx_t;
+
+        const int thread_rank, thread_size;
+
+        private:
+
+        const rng_t thread_j;
+        const int grid_size_y;
+
+        // try to guess what should be the whole domain exchanged by this process
+        // based on the difference between idx to be sent by this thread and idx of this process
+        idx_t extend_idx(idx_t idx)
+        {
+          //std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl;
+          idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first();                // does it have to start at 0?
+          idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1?
+          //std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl;
+          return idx;
+        }
+
+        public:
+
+        void xchng (
+          const arr_t &a,
+          const idx_t &idx_send,
+          const idx_t &idx_recv
+        )
+        {
+          parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv));
+        }
+
+        void send (
+          const arr_t &a,
+          const idx_t &idx_send
+        )
+        {
+          parent_t::send(a, extend_idx(idx_send));
+        }
+
+        void recv (
+          const arr_t &a,
+          const idx_t &idx_recv
+        )
+        {
+          parent_t::recv(a, extend_idx(idx_recv));
+        }
+
+        // ctor
+        remote_3d_common(
+          const rng_t &i,
+          const std::array<int, 3> &distmem_grid_size,
+          const rng_t _thread_j,
+          const int thread_rank,
+          const int thread_size
+        ) :
+          parent_t(i, distmem_grid_size, true), // true indicating that this is a bcond done with a single thread
+          thread_rank(thread_rank),
+          thread_size(thread_size),
+          thread_j(_thread_j),
+          grid_size_y(distmem_grid_size[1])
+        {
+#if defined(USE_MPI)
+          // only 2 threads do mpi, others don't need buffers
+          if(thread_rank != 0 && thread_rank != thread_size-1)
+          {
+            free(parent_t::buf_send);
+            free(parent_t::buf_recv);
+          }
+          //std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl;
+          //std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl;
+          //std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl;
+          //std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl;
+#endif
+        }
+
+        // dtor
+        ~remote_3d_common()
+        {
+#if defined(USE_MPI)
+          if(thread_rank == 0 || thread_rank == thread_size-1)
+          {
+            free(parent_t::buf_send);
+            free(parent_t::buf_recv);
+          }
+          // hack to make free in ~remote_common give defined behaviour
+          parent_t::buf_send = nullptr;
+          parent_t::buf_recv = nullptr;
+#endif
+        }
+      };
+    };
+  } // namespace bcond
+} // namespace libmpdataxx
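
The key addition above is extend_idx: each shared-memory thread hands in the y-range it owns (thread_j), and the method widens dimension 1 of the index to the full per-process y-extent before delegating to remote_common, so the (at most two) threads that actually do MPI exchange the whole per-process slab, while the remaining threads free their unused buffers in the ctor. A standalone sketch of that arithmetic on plain integers, with illustrative names:

#include <cassert>
#include <utility>

// Widen a thread-local y-range to the full per-process y-extent, preserving
// any offsets the caller applied relative to its own sub-range (mirrors
// extend_idx above, which does the same on dimension 1 of a blitz RectDomain).
std::pair<int, int> extend_y(std::pair<int, int> idx_y,
                             int thread_j_first, int thread_j_last,
                             int grid_size_y)
{
  idx_y.first  = 0                 + idx_y.first  - thread_j_first;
  idx_y.second = (grid_size_y - 1) + idx_y.second - thread_j_last;
  return idx_y;
}

int main()
{
  // e.g. a thread owning y in [32, 47] of a 128-cell y-extent asking to
  // exchange exactly its own range ends up exchanging the whole extent
  const auto full = extend_y({32, 47}, 32, 47, 128);
  assert(full.first == 0 && full.second == 127);
  return 0;
}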

libmpdata++/bcond/detail/remote_common.hpp

Lines changed: 37 additions & 16 deletions
@@ -29,21 +29,22 @@ namespace libmpdataxx
       using arr_t = blitz::Array<real_t, n_dims>;
       using idx_t = blitz::RectDomain<n_dims>;
 
-      private:
+      real_t *buf_send,
+             *buf_recv;
 
-      const int grid_size_0;
+      private:
 
 #if defined(USE_MPI)
       boost::mpi::communicator mpicom;
-      real_t *buf_send,
-             *buf_recv;
 
 # if defined(NDEBUG)
       static const int n_reqs = 2; // data, reqs for recv only is enough?
-      static const int n_dbg_reqs = 0;
+      static const int n_dbg_send_reqs = 0;
+      static const int n_dbg_tags = 0;
 # else
       static const int n_reqs = 4; // data + ranges
-      static const int n_dbg_reqs = 1;
+      static const int n_dbg_send_reqs = 1;
+      static const int n_dbg_tags = 2;
 # endif
 
       std::array<boost::mpi::request, n_reqs> reqs;
@@ -53,7 +54,6 @@ namespace libmpdataxx
         : (mpicom.rank() + 1 ) % mpicom.size();
 
 # if !defined(NDEBUG)
-      const int debug = 2;
       std::pair<int, int> buf_rng;
 # endif
 #endif
@@ -78,6 +78,12 @@ namespace libmpdataxx
         const int
           msg_send = dir == left ? left : rght;
 
+        // std::cerr << "send_hlpr idx dir " << dir << " : "
+        //           << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")"
+        //           << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")"
+        //           << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")"
+        //           << std::endl;
+
         // arr_send references part of the send buffer that will be used
         arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData);
         // copying data to be sent
@@ -92,7 +98,7 @@ namespace libmpdataxx
 
         // sending debug information
 # if !defined(NDEBUG)
-        reqs[1] = mpicom.isend(peer, msg_send ^ debug, std::pair<int,int>(
+        reqs[1] = mpicom.isend(peer, msg_send + n_dbg_tags, std::pair<int,int>(
          idx_send[0].first(),
          idx_send[0].last()
        ));
@@ -112,15 +118,21 @@ namespace libmpdataxx
         const int
           msg_recv = dir == left ? rght : left;
 
+        // std::cerr << "recv_hlpr idx dir " << dir << " : "
+        //           << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")"
+        //           << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")"
+        //           << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")"
+        //           << std::endl;
+
 
         // launching async data transfer
         if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv
         {
-          reqs[1+n_dbg_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size());
+          reqs[1+n_dbg_send_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size());
 
           // sending debug information
 # if !defined(NDEBUG)
-          reqs[3] = mpicom.irecv(peer, msg_recv ^ debug, buf_rng);
+          reqs[3] = mpicom.irecv(peer, msg_recv + n_dbg_tags, buf_rng);
 # endif
         }
 #else
@@ -137,7 +149,7 @@ namespace libmpdataxx
         send_hlpr(a, idx_send);
 
         // waiting for the transfers to finish
-        boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_reqs); // MPI_Waitall is thread-safe?
+        boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_send_reqs); // MPI_Waitall is thread-safe?
 #else
         assert(false);
 #endif
@@ -153,7 +165,7 @@ namespace libmpdataxx
         recv_hlpr(a, idx_recv);
 
         // waiting for the transfers to finish
-        boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_reqs, reqs.end()); // MPI_Waitall is thread-safe?
+        boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_send_reqs, reqs.end()); // MPI_Waitall is thread-safe?
 
         // a blitz handler for the used part of the receive buffer
         arr_t arr_recv(buf_recv, a(idx_recv).shape(), blitz::neverDeleteData); // TODO: shape directly from idx_recv
@@ -207,13 +219,21 @@ namespace libmpdataxx
       // ctor
       remote_common(
         const rng_t &i,
-        const std::array<int, n_dims> &grid_size
+        const std::array<int, n_dims> &distmem_grid_size,
+        bool single_threaded = false
       ) :
-        parent_t(i, grid_size),
-        grid_size_0(grid_size[0])
+        parent_t(i, distmem_grid_size, single_threaded)
       {
 #if defined(USE_MPI)
-        const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides
+
+        const int slice_size = n_dims==1 ? 1 : (n_dims==2? distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides
+        //std::cerr << "remote_common ctor, "
+        //          << " distmem_grid_size[0]: " << distmem_grid_size[0]
+        //          << " distmem_grid_size[1]: " << distmem_grid_size[1]
+        //          << " distmem_grid_size[2]: " << distmem_grid_size[2]
+        //          << " slice_size: " << slice_size
+        //          << " halo: " << halo
+        //          << std::endl;
         // allocate enough memory in buffers to store largest halos to be sent
         buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t));
         buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t));
@@ -232,3 +252,4 @@ namespace libmpdataxx
       }
     } // namespace bcond
   } // namespace libmpdataxx
+

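Besides the rename to distmem_grid_size, two things change here: buf_send and buf_recv move out of the #if defined(USE_MPI) block and above the private: section, so the derived remote_3d_common can free and reassign them, and the debug-range messages are now tagged with msg + n_dbg_tags instead of XOR-ing with a local debug constant. The buffers are sized for the largest halo slab to be exchanged; a sketch of that sizing, assuming (as the in-code comment does) that 3 is the maximum halo, hence the +6 per perpendicular dimension:

#include <cstddef>
#include <cstdio>

// Number of real_t elements malloc'ed for each of buf_send / buf_recv in the
// ctor above; ny and nz are the distributed-memory grid sizes in the
// perpendicular dimensions (illustrative helper, not a libmpdata++ function).
std::size_t mpi_buf_elems(int n_dims, int halo, int ny, int nz)
{
  const int slice_size =
    n_dims == 1 ? 1
  : n_dims == 2 ? ny + 6
  :               (ny + 6) * (nz + 6);
  return static_cast<std::size_t>(halo) * slice_size;
}

int main()
{
  // e.g. a 3-D run with halo = 2 and a 64 x 32 (y, z) grid
  std::printf("%zu elements per buffer\n", mpi_buf_elems(3, 2, 64, 32));
  return 0;
}
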
libmpdata++/bcond/open_3d.hpp

Lines changed: 6 additions & 6 deletions
@@ -78,15 +78,15 @@ namespace libmpdataxx
         // TODO: exactly the same code below!
         switch (d) // note: order and lack of breaks intentional!
         {
-          case 0:
+          case 1:
             av[d+2](pi<d>(i, j, (k-h).first())) = 0;
             av[d+2](pi<d>(i, j, (k+h).last() )) = 0;
 
-          case 1:
+          case 2:
             av[d+1](pi<d>(i, (j-h).first(), k)) = 0;
             av[d+1](pi<d>(i, (j+h).last(), k)) = 0;
 
-          case 2:
+          case 0:
             break;
 
           default: assert(false);
@@ -190,15 +190,15 @@ namespace libmpdataxx
 
         switch (d) // note: order and lack of breaks intentional!
         {
-          case 0:
+          case 1:
             av[d+2](pi<d>(i, j, (k-h).first())) = 0;
             av[d+2](pi<d>(i, j, (k+h).last() )) = 0;
 
-          case 1:
+          case 2:
             av[d+1](pi<d>(i, (j-h).first(), k)) = 0;
             av[d+1](pi<d>(i, (j+h).last(), k)) = 0;
 
-          case 2:
+          case 0:
             break;
 
           default: assert(false);

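The case labels are reordered but the intentional fall-through is kept: entering at case 1 now zeroes both perpendicular normal-velocity components, case 2 zeroes one, and case 0 none, matching the changed role of the y dimension (handled in shared memory with the new blitz storage order). A minimal illustration of the fall-through pattern, not taken from the library:

#include <cstdio>

void zero_perpendicular(int d)
{
  switch (d) // order and lack of breaks intentional, as in open_3d.hpp
  {
    case 1:
      std::printf("d=%d: zero av[d+2] edge values\n", d);
      // fall through
    case 2:
      std::printf("d=%d: zero av[d+1] edge values\n", d);
      // fall through
    case 0:
      break;
    default:
      break;
  }
}

int main()
{
  for (int d = 0; d < 3; ++d)
    zero_perpendicular(d);
  return 0;
}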