
Commit cefa43a

Merge pull request #467 from pdziekan/sharedmem_3d_y_blitz_storage_order_mpi_by_two_threads
Sharedmem 3d y blitz storage order mpi by two threads
2 parents 23a8eb1 + 3964132 commit cefa43a

20 files changed: +518 / -140 lines

libmpdata++/bcond/detail/bcond_common.hpp

Lines changed: 5 additions & 2 deletions
@@ -193,6 +193,8 @@ namespace libmpdataxx
     virtual void avg_edge_and_halo1_sclr_cyclic(arr_3d_t &, const rng_t &, const rng_t &)
     {};
 
+    const bool single_threaded;
+
     protected:
     // sclr
     int
@@ -207,7 +209,7 @@ namespace libmpdataxx
     public:
 
     // ctor
-    bcond_common(const rng_t &i, const std::array<int, n_dims> &) :
+    bcond_common(const rng_t &i, const std::array<int, n_dims> &, bool single_threaded = false) :
      // sclr
      left_edge_sclr(
        i.first()
@@ -247,7 +249,8 @@ namespace libmpdataxx
      rght_intr_vctr(
        (i^h^(-1)).last() - (halo - 1),
        (i^h^(-1)).last()
-     )
+     ),
+     single_threaded(single_threaded)
     {}
 
     // the one for use in shared
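
The diff above adds a trailing, defaulted single_threaded flag to the bcond_common constructor and stores it as a const member, so existing boundary conditions compile unchanged while boundary conditions serviced by a single thread per MPI process (see remote_3d_common below) can opt in. A minimal sketch of the pattern, with illustrative class names that are not part of libmpdata++:

#include <array>

struct bcond_base                            // stand-in for bcond_common
{
  const bool single_threaded;
  bcond_base(const std::array<int, 3> &, bool single_threaded = false) :
    single_threaded(single_threaded)
  {}
};

struct mpi_by_one_thread_bcond : bcond_base  // stand-in for remote_3d_common
{
  mpi_by_one_thread_bcond(const std::array<int, 3> &grid) :
    bcond_base(grid, true)                   // this bcond is handled by a single thread
  {}
};

int main()
{
  bcond_base b({{10, 20, 30}});              // existing call sites: defaults to false
  mpi_by_one_thread_bcond bc({{10, 20, 30}});
  return (!b.single_threaded && bc.single_threaded) ? 0 : 1;
}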

libmpdata++/bcond/detail/polar_common.hpp

Lines changed: 3 additions & 3 deletions
@@ -37,10 +37,10 @@ namespace libmpdataxx
       // ctor
       polar_common(
         const rng_t &i,
-        const std::array<int, n_dims> &grid_size
+        const std::array<int, n_dims> &distmem_grid_size
       ) :
-        parent_t(i, grid_size),
-        pole((grid_size[0] - 1) / 2)
+        parent_t(i, distmem_grid_size),
+        pole((distmem_grid_size[0] - 1) / 2)
       {}
     };
   } // namespace detail

Lines changed: 116 additions & 0 deletions (new file)
@@ -0,0 +1,116 @@
+// common code for ``remote'' MPI boundary conditions for libmpdata++
+//
+// licensing: GPU GPL v3
+// copyright: University of Warsaw
+
+#pragma once
+
+#include <libmpdata++/bcond/detail/remote_common.hpp>
+
+namespace libmpdataxx
+{
+  namespace bcond
+  {
+    namespace detail
+    {
+      template <typename real_t, int halo, drctn_e dir>
+      class remote_3d_common : public remote_common<real_t, halo, dir, 3>
+      {
+        using parent_t = detail::remote_common<real_t, halo, dir, 3>;
+
+        protected:
+
+        using arr_t = typename parent_t::arr_t;
+        using idx_t = typename parent_t::idx_t;
+
+        const int thread_rank, thread_size;
+
+        private:
+
+        const rng_t thread_j;
+        const int grid_size_y;
+
+        // try to guess what should be the whole domain exchanged by this process
+        // based on the difference between idx to be sent by this thread and idx of this process
+        idx_t extend_idx(idx_t idx)
+        {
+          //std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl;
+          idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first();                // does it have to start at 0?
+          idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1?
+          //std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl;
+          return idx;
+        }
+
+        public:
+
+        void xchng (
+          const arr_t &a,
+          const idx_t &idx_send,
+          const idx_t &idx_recv
+        )
+        {
+          parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv));
+        }
+
+        void send (
+          const arr_t &a,
+          const idx_t &idx_send
+        )
+        {
+          parent_t::send(a, extend_idx(idx_send));
+        }
+
+        void recv (
+          const arr_t &a,
+          const idx_t &idx_recv
+        )
+        {
+          parent_t::recv(a, extend_idx(idx_recv));
+        }
+
+        // ctor
+        remote_3d_common(
+          const rng_t &i,
+          const std::array<int, 3> &distmem_grid_size,
+          const rng_t _thread_j,
+          const int thread_rank,
+          const int thread_size
+        ) :
+          parent_t(i, distmem_grid_size, true), // true indicating that this is a bcond done with a single thread
+          thread_rank(thread_rank),
+          thread_size(thread_size),
+          thread_j(_thread_j),
+          grid_size_y(distmem_grid_size[1])
+        {
+#if defined(USE_MPI)
+          // only 2 threads do mpi, others don't need buffers
+          if(thread_rank != 0 && thread_rank != thread_size-1)
+          {
+            free(parent_t::buf_send);
+            free(parent_t::buf_recv);
+          }
+          //std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl;
+          //std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl;
+          //std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl;
+          //std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl;
+#endif
+        }
+
+        // dtor
+        ~remote_3d_common()
+        {
+#if defined(USE_MPI)
+          if(thread_rank == 0 || thread_rank == thread_size-1)
+          {
+            free(parent_t::buf_send);
+            free(parent_t::buf_recv);
+          }
+          // hack to make free in ~remote_common give defined behaviour
+          parent_t::buf_send = nullptr;
+          parent_t::buf_recv = nullptr;
+#endif
+        }
+      };
+    };
+  } // namespace bcond
+} // namespace libmpdataxx
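
The key addition above is extend_idx: each shared-memory thread hands in the y-range it owns (thread_j), and the method widens dimension 1 of the index to the full per-process y-extent before delegating to remote_common, so the (at most two) threads that actually do MPI exchange the whole per-process slab, while the remaining threads free their unused buffers in the ctor. A standalone sketch of that arithmetic on plain integers, with illustrative names:

#include <cassert>
#include <utility>

// Widen a thread-local y-range to the full per-process y-extent, preserving
// any offsets the caller applied relative to its own sub-range (mirrors
// extend_idx above, which does the same on dimension 1 of a blitz RectDomain).
std::pair<int, int> extend_y(std::pair<int, int> idx_y,
                             int thread_j_first, int thread_j_last,
                             int grid_size_y)
{
  idx_y.first  = 0                 + idx_y.first  - thread_j_first;
  idx_y.second = (grid_size_y - 1) + idx_y.second - thread_j_last;
  return idx_y;
}

int main()
{
  // e.g. a thread owning y in [32, 47] of a 128-cell y-extent asking to
  // exchange exactly its own range ends up exchanging the whole extent
  const auto full = extend_y({32, 47}, 32, 47, 128);
  assert(full.first == 0 && full.second == 127);
  return 0;
}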

libmpdata++/bcond/detail/remote_common.hpp

Lines changed: 37 additions & 16 deletions
@@ -29,21 +29,22 @@ namespace libmpdataxx
       using arr_t = blitz::Array<real_t, n_dims>;
       using idx_t = blitz::RectDomain<n_dims>;
 
-      private:
+      real_t *buf_send,
+             *buf_recv;
 
-      const int grid_size_0;
+      private:
 
 #if defined(USE_MPI)
       boost::mpi::communicator mpicom;
-      real_t *buf_send,
-             *buf_recv;
 
 # if defined(NDEBUG)
       static const int n_reqs = 2; // data, reqs for recv only is enough?
-      static const int n_dbg_reqs = 0;
+      static const int n_dbg_send_reqs = 0;
+      static const int n_dbg_tags = 0;
 # else
       static const int n_reqs = 4; // data + ranges
-      static const int n_dbg_reqs = 1;
+      static const int n_dbg_send_reqs = 1;
+      static const int n_dbg_tags = 2;
 # endif
 
       std::array<boost::mpi::request, n_reqs> reqs;
@@ -53,7 +54,6 @@ namespace libmpdataxx
         : (mpicom.rank() + 1 ) % mpicom.size();
 
 # if !defined(NDEBUG)
-      const int debug = 2;
       std::pair<int, int> buf_rng;
 # endif
 #endif
@@ -78,6 +78,12 @@ namespace libmpdataxx
         const int
           msg_send = dir == left ? left : rght;
 
+        // std::cerr << "send_hlpr idx dir " << dir << " : "
+        //           << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")"
+        //           << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")"
+        //           << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")"
+        //           << std::endl;
+
         // arr_send references part of the send buffer that will be used
         arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData);
         // copying data to be sent
@@ -92,7 +98,7 @@ namespace libmpdataxx
 
         // sending debug information
 # if !defined(NDEBUG)
-        reqs[1] = mpicom.isend(peer, msg_send ^ debug, std::pair<int,int>(
+        reqs[1] = mpicom.isend(peer, msg_send + n_dbg_tags, std::pair<int,int>(
          idx_send[0].first(),
          idx_send[0].last()
        ));
@@ -112,15 +118,21 @@ namespace libmpdataxx
         const int
           msg_recv = dir == left ? rght : left;
 
+        // std::cerr << "recv_hlpr idx dir " << dir << " : "
+        //           << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")"
+        //           << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")"
+        //           << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")"
+        //           << std::endl;
+
 
         // launching async data transfer
         if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv
         {
-          reqs[1+n_dbg_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size());
+          reqs[1+n_dbg_send_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size());
 
           // sending debug information
 # if !defined(NDEBUG)
-          reqs[3] = mpicom.irecv(peer, msg_recv ^ debug, buf_rng);
+          reqs[3] = mpicom.irecv(peer, msg_recv + n_dbg_tags, buf_rng);
 # endif
         }
 #else
@@ -137,7 +149,7 @@ namespace libmpdataxx
         send_hlpr(a, idx_send);
 
         // waiting for the transfers to finish
-        boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_reqs); // MPI_Waitall is thread-safe?
+        boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_send_reqs); // MPI_Waitall is thread-safe?
 #else
         assert(false);
 #endif
@@ -153,7 +165,7 @@ namespace libmpdataxx
         recv_hlpr(a, idx_recv);
 
         // waiting for the transfers to finish
-        boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_reqs, reqs.end()); // MPI_Waitall is thread-safe?
+        boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_send_reqs, reqs.end()); // MPI_Waitall is thread-safe?
 
         // a blitz handler for the used part of the receive buffer
         arr_t arr_recv(buf_recv, a(idx_recv).shape(), blitz::neverDeleteData); // TODO: shape directly from idx_recv
@@ -207,13 +219,21 @@ namespace libmpdataxx
       // ctor
       remote_common(
         const rng_t &i,
-        const std::array<int, n_dims> &grid_size
+        const std::array<int, n_dims> &distmem_grid_size,
+        bool single_threaded = false
       ) :
-        parent_t(i, grid_size),
-        grid_size_0(grid_size[0])
+        parent_t(i, distmem_grid_size, single_threaded)
       {
 #if defined(USE_MPI)
-        const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides
+
+        const int slice_size = n_dims==1 ? 1 : (n_dims==2? distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides
+        //std::cerr << "remote_common ctor, "
+        //          << " distmem_grid_size[0]: " << distmem_grid_size[0]
+        //          << " distmem_grid_size[1]: " << distmem_grid_size[1]
+        //          << " distmem_grid_size[2]: " << distmem_grid_size[2]
+        //          << " slice_size: " << slice_size
+        //          << " halo: " << halo
+        //          << std::endl;
         // allocate enough memory in buffers to store largest halos to be sent
         buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t));
         buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t));
@@ -232,3 +252,4 @@ namespace libmpdataxx
       }
     } // namespace bcond
   } // namespace libmpdataxx
+

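Besides the rename to distmem_grid_size, two things change here: buf_send and buf_recv move out of the #if defined(USE_MPI) block and above the private: section, so the derived remote_3d_common can free and reassign them, and the debug-range messages are now tagged with msg + n_dbg_tags instead of XOR-ing with a local debug constant. The buffers are sized for the largest halo slab to be exchanged; a sketch of that sizing, assuming (as the in-code comment does) that 3 is the maximum halo, hence the +6 per perpendicular dimension:

#include <cstddef>
#include <cstdio>

// Number of real_t elements malloc'ed for each of buf_send / buf_recv in the
// ctor above; ny and nz are the distributed-memory grid sizes in the
// perpendicular dimensions (illustrative helper, not a libmpdata++ function).
std::size_t mpi_buf_elems(int n_dims, int halo, int ny, int nz)
{
  const int slice_size =
    n_dims == 1 ? 1
  : n_dims == 2 ? ny + 6
  :               (ny + 6) * (nz + 6);
  return static_cast<std::size_t>(halo) * slice_size;
}

int main()
{
  // e.g. a 3-D run with halo = 2 and a 64 x 32 (y, z) grid
  std::printf("%zu elements per buffer\n", mpi_buf_elems(3, 2, 64, 32));
  return 0;
}
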
libmpdata++/bcond/open_3d.hpp

Lines changed: 6 additions & 6 deletions
@@ -78,15 +78,15 @@ namespace libmpdataxx
         // TODO: exactly the same code below!
         switch (d) // note: order and lack of breaks intentional!
         {
-          case 0:
+          case 1:
             av[d+2](pi<d>(i, j, (k-h).first())) = 0;
             av[d+2](pi<d>(i, j, (k+h).last() )) = 0;
 
-          case 1:
+          case 2:
             av[d+1](pi<d>(i, (j-h).first(), k)) = 0;
             av[d+1](pi<d>(i, (j+h).last(), k)) = 0;
 
-          case 2:
+          case 0:
             break;
 
           default: assert(false);
@@ -190,15 +190,15 @@ namespace libmpdataxx
 
         switch (d) // note: order and lack of breaks intentional!
         {
-          case 0:
+          case 1:
             av[d+2](pi<d>(i, j, (k-h).first())) = 0;
             av[d+2](pi<d>(i, j, (k+h).last() )) = 0;
 
-          case 1:
+          case 2:
             av[d+1](pi<d>(i, (j-h).first(), k)) = 0;
             av[d+1](pi<d>(i, (j+h).last(), k)) = 0;
 
-          case 2:
+          case 0:
             break;
 
           default: assert(false);

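The case labels are reordered but the intentional fall-through is kept: entering at case 1 now zeroes both perpendicular normal-velocity components, case 2 zeroes one, and case 0 none, matching the changed role of the y dimension (handled in shared memory with the new blitz storage order). A minimal illustration of the fall-through pattern, not taken from the library:

#include <cstdio>

void zero_perpendicular(int d)
{
  switch (d) // order and lack of breaks intentional, as in open_3d.hpp
  {
    case 1:
      std::printf("d=%d: zero av[d+2] edge values\n", d);
      // fall through
    case 2:
      std::printf("d=%d: zero av[d+1] edge values\n", d);
      // fall through
    case 0:
      break;
    default:
      break;
  }
}

int main()
{
  for (int d = 0; d < 3; ++d)
    zero_perpendicular(d);
  return 0;
}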