Skip to content

Commit 9c4dcc5

Browse files
committed
Merge branch 'master' into develop
2 parents a0ccbb3 + a1063dd commit 9c4dcc5

13 files changed

Lines changed: 683 additions & 87 deletions

Grid/GridStd.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,7 @@
2828
///////////////////
2929
#include "Config.h"
3030

31+
#ifdef TOFU
32+
#undef GRID_COMMS_THREADS
33+
#endif
3134
#endif /* GRID_STD_H */

Grid/allocator/AlignedAllocator.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,9 +165,17 @@ template<typename _Tp> inline bool operator!=(const devAllocator<_Tp>&, const d
165165
////////////////////////////////////////////////////////////////////////////////
166166
// Template typedefs
167167
////////////////////////////////////////////////////////////////////////////////
168-
//template<class T> using commAllocator = devAllocator<T>;
168+
#ifdef ACCELERATOR_CSHIFT
169+
// Cshift on device
170+
template<class T> using cshiftAllocator = devAllocator<T>;
171+
#else
172+
// Cshift on host
173+
template<class T> using cshiftAllocator = std::allocator<T>;
174+
#endif
175+
169176
template<class T> using Vector = std::vector<T,uvmAllocator<T> >;
170177
template<class T> using commVector = std::vector<T,devAllocator<T> >;
178+
template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
171179

172180
NAMESPACE_END(Grid);
173181

Grid/communicator/Communicator_mpi3.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
4444
MPI_Initialized(&flag); // needed to coexist with other libs apparently
4545
if ( !flag ) {
4646

47-
#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
47+
#ifndef GRID_COMMS_THREADS
4848
nCommThreads=1;
4949
// wrong results here too
5050
// For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
358358
assert(from != _processor);
359359
assert(gme == ShmRank);
360360
double off_node_bytes=0.0;
361+
int tag;
361362

362363
if ( gfrom ==MPI_UNDEFINED) {
363-
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
364+
tag= dir+from*32;
365+
ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
364366
assert(ierr==0);
365367
list.push_back(rrq);
366368
off_node_bytes+=bytes;
367369
}
368370

369371
if ( gdest == MPI_UNDEFINED ) {
370-
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
372+
tag= dir+_processor*32;
373+
ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
371374
assert(ierr==0);
372375
list.push_back(xrq);
373376
off_node_bytes+=bytes;

Grid/communicator/SharedMemoryMPI.cc

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
457457
std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
458458
exit(EXIT_FAILURE);
459459
}
460-
if ( WorldRank == 0 ){
461-
std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes
460+
// if ( WorldRank == 0 ){
461+
if ( 1 ){
462+
std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes
462463
<< "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
463464
}
464465
SharedMemoryZero(ShmCommBuf,bytes);
@@ -771,20 +772,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
771772
std::vector<int> ranks(size); for(int r=0;r<size;r++) ranks[r]=r;
772773
MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]);
773774

774-
#ifdef GRID_IBM_SUMMIT
775-
// Hide the shared memory path between sockets
776-
// if even number of nodes
777-
if ( (ShmSize & 0x1)==0 ) {
778-
int SocketSize = ShmSize/2;
779-
int mySocket = ShmRank/SocketSize;
775+
#ifdef GRID_SHM_DISABLE
776+
// Hide the shared memory path between ranks
777+
{
780778
for(int r=0;r<size;r++){
781-
int hisRank=ShmRanks[r];
782-
if ( hisRank!= MPI_UNDEFINED ) {
783-
int hisSocket=hisRank/SocketSize;
784-
if ( hisSocket != mySocket ) {
785-
ShmRanks[r] = MPI_UNDEFINED;
786-
}
787-
}
779+
ShmRanks[r] = MPI_UNDEFINED;
788780
}
789781
}
790782
#endif

Grid/cshift/Cshift_common.h

Lines changed: 79 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
3535
// Gather for when there is no need to SIMD split
3636
///////////////////////////////////////////////////////////////////
3737
template<class vobj> void
38-
Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
38+
Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
3939
{
4040
int rd = rhs.Grid()->_rdimensions[dimension];
4141

@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
7373
}
7474
}
7575
{
76-
autoView(rhs_v , rhs, AcceleratorRead);
7776
auto buffer_p = & buffer[0];
7877
auto table = &Cshift_table[0];
78+
#ifdef ACCELERATOR_CSHIFT
79+
autoView(rhs_v , rhs, AcceleratorRead);
7980
accelerator_for(i,ent,vobj::Nsimd(),{
8081
coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
8182
});
83+
#else
84+
autoView(rhs_v , rhs, CpuRead);
85+
thread_for(i,ent,{
86+
buffer_p[table[i].first]=rhs_v[table[i].second];
87+
});
88+
#endif
8289
}
8390
}
8491

@@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
103110
int n1=rhs.Grid()->_slice_stride[dimension];
104111

105112
if ( cbmask ==0x3){
113+
#ifdef ACCELERATOR_CSHIFT
106114
autoView(rhs_v , rhs, AcceleratorRead);
107115
accelerator_for2d(n,e1,b,e2,1,{
108116
int o = n*n1;
@@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
111119
vobj temp =rhs_v[so+o+b];
112120
extract<vobj>(temp,pointers,offset);
113121
});
122+
#else
123+
autoView(rhs_v , rhs, CpuRead);
124+
thread_for2d(n,e1,b,e2,{
125+
int o = n*n1;
126+
int offset = b+n*e2;
127+
128+
vobj temp =rhs_v[so+o+b];
129+
extract<vobj>(temp,pointers,offset);
130+
});
131+
#endif
114132
} else {
115-
autoView(rhs_v , rhs, AcceleratorRead);
116-
117133
Coordinate rdim=rhs.Grid()->_rdimensions;
118134
Coordinate cdm =rhs.Grid()->_checker_dim_mask;
119135
std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
136+
#ifdef ACCELERATOR_CSHIFT
137+
autoView(rhs_v , rhs, AcceleratorRead);
120138
accelerator_for2d(n,e1,b,e2,1,{
121139

122140
Coordinate coor;
@@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
134152
extract<vobj>(temp,pointers,offset);
135153
}
136154
});
155+
#else
156+
autoView(rhs_v , rhs, CpuRead);
157+
thread_for2d(n,e1,b,e2,{
158+
159+
Coordinate coor;
160+
161+
int o=n*n1;
162+
int oindex = o+b;
163+
164+
int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
165+
166+
int ocb=1<<cb;
167+
int offset = b+n*e2;
168+
169+
if ( ocb & cbmask ) {
170+
vobj temp =rhs_v[so+o+b];
171+
extract<vobj>(temp,pointers,offset);
172+
}
173+
});
174+
#endif
137175
}
138176
}
139177

140178
//////////////////////////////////////////////////////
141179
// Scatter for when there is no need to SIMD split
142180
//////////////////////////////////////////////////////
143-
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
181+
template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
144182
{
145183
int rd = rhs.Grid()->_rdimensions[dimension];
146184

@@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
182220
}
183221

184222
{
185-
autoView( rhs_v, rhs, AcceleratorWrite);
186223
auto buffer_p = & buffer[0];
187224
auto table = &Cshift_table[0];
225+
#ifdef ACCELERATOR_CSHIFT
226+
autoView( rhs_v, rhs, AcceleratorWrite);
188227
accelerator_for(i,ent,vobj::Nsimd(),{
189228
coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
190229
});
230+
#else
231+
autoView( rhs_v, rhs, CpuWrite);
232+
thread_for(i,ent,{
233+
rhs_v[table[i].first]=buffer_p[table[i].second];
234+
});
235+
#endif
191236
}
192237
}
193238

@@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
208253
int e2=rhs.Grid()->_slice_block[dimension];
209254

210255
if(cbmask ==0x3 ) {
211-
autoView( rhs_v , rhs, AcceleratorWrite);
212256
int _slice_stride = rhs.Grid()->_slice_stride[dimension];
213257
int _slice_block = rhs.Grid()->_slice_block[dimension];
258+
#ifdef ACCELERATOR_CSHIFT
259+
autoView( rhs_v , rhs, AcceleratorWrite);
214260
accelerator_for2d(n,e1,b,e2,1,{
215261
int o = n*_slice_stride;
216262
int offset = b+n*_slice_block;
217263
merge(rhs_v[so+o+b],pointers,offset);
218264
});
265+
#else
266+
autoView( rhs_v , rhs, CpuWrite);
267+
thread_for2d(n,e1,b,e2,{
268+
int o = n*_slice_stride;
269+
int offset = b+n*_slice_block;
270+
merge(rhs_v[so+o+b],pointers,offset);
271+
});
272+
#endif
219273
} else {
220274

221275
// Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
280334
}
281335

282336
{
337+
auto table = &Cshift_table[0];
338+
#ifdef ACCELERATOR_CSHIFT
283339
autoView(rhs_v , rhs, AcceleratorRead);
284340
autoView(lhs_v , lhs, AcceleratorWrite);
285-
auto table = &Cshift_table[0];
286341
accelerator_for(i,ent,vobj::Nsimd(),{
287342
coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
288343
});
344+
#else
345+
autoView(rhs_v , rhs, CpuRead);
346+
autoView(lhs_v , lhs, CpuWrite);
347+
thread_for(i,ent,{
348+
lhs_v[table[i].first]=rhs_v[table[i].second];
349+
});
350+
#endif
289351
}
290352
}
291353

@@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
324386
}
325387

326388
{
389+
auto table = &Cshift_table[0];
390+
#ifdef ACCELERATOR_CSHIFT
327391
autoView( rhs_v, rhs, AcceleratorRead);
328392
autoView( lhs_v, lhs, AcceleratorWrite);
329-
auto table = &Cshift_table[0];
330393
accelerator_for(i,ent,1,{
331394
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
332395
});
396+
#else
397+
autoView( rhs_v, rhs, CpuRead);
398+
autoView( lhs_v, lhs, CpuWrite);
399+
thread_for(i,ent,{
400+
permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
401+
});
402+
#endif
333403
}
334404
}
335405

0 commit comments

Comments
 (0)