paboyle
diff --git a/Grid/GridStd.h b/Grid/GridStd.h
Lines changed: 3 additions & 0 deletions
diff --git a/Grid/allocator/AlignedAllocator.h b/Grid/allocator/AlignedAllocator.h
Lines changed: 9 additions & 1 deletion
diff --git a/Grid/communicator/Communicator_mpi3.cc b/Grid/communicator/Communicator_mpi3.cc
Lines changed: 6 additions & 3 deletions
diff --git a/Grid/communicator/SharedMemoryMPI.cc b/Grid/communicator/SharedMemoryMPI.cc
Lines changed: 7 additions & 15 deletions
diff --git a/Grid/cshift/Cshift_common.h b/Grid/cshift/Cshift_common.h
Lines changed: 79 additions & 9 deletions
@@ -28,4 +28,7 @@
 ///////////////////
 #include "Config.h"
 
+#ifdef TOFU
+#undef GRID_COMMS_THREADS
+#endif
 #endif /* GRID_STD_H */
@@ -165,9 +165,17 @@ template<typename _Tp>  inline bool operator!=(const devAllocator<_Tp>&, const d
 ////////////////////////////////////////////////////////////////////////////////
 // Template typedefs
 ////////////////////////////////////////////////////////////////////////////////
-//template<class T> using commAllocator = devAllocator<T>;
+#ifdef ACCELERATOR_CSHIFT
+// ACCELERATOR_CSHIFT defined: cshiftAllocator aliases devAllocator (Cshift buffers on the device side)
+template<class T> using cshiftAllocator = devAllocator<T>;
+#else
+// ACCELERATOR_CSHIFT not defined: cshiftAllocator aliases std::allocator (Cshift buffers on the host)
+template<class T> using cshiftAllocator = std::allocator<T>;
+#endif
+
 template<class T> using Vector     = std::vector<T,uvmAllocator<T> >;           
 template<class T> using commVector = std::vector<T,devAllocator<T> >;
+template<class T> using cshiftVector = std::vector<T,cshiftAllocator<T> >;
 
 NAMESPACE_END(Grid);
 
 
@@ -44,7 +44,7 @@ void CartesianCommunicator::Init(int *argc, char ***argv)
   MPI_Initialized(&flag); // needed to coexist with other libs apparently
   if ( !flag ) {
 
-#if defined (TOFU) // FUGAKU, credits go to Issaku Kanamori
+#ifndef GRID_COMMS_THREADS
     nCommThreads=1;
     // wrong results here too
     // For now: comms-overlap leads to wrong results in Benchmark_wilson even on single node MPI runs
@@ -358,16 +358,19 @@ double CartesianCommunicator::StencilSendToRecvFromBegin(std::vector<CommsReques
   assert(from != _processor);
   assert(gme  == ShmRank);
   double off_node_bytes=0.0;
+  int tag;
 
   if ( gfrom ==MPI_UNDEFINED) {
-    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,from,communicator_halo[commdir],&rrq);
+    tag= dir+from*32;
+    ierr=MPI_Irecv(recv, bytes, MPI_CHAR,from,tag,communicator_halo[commdir],&rrq);
     assert(ierr==0);
     list.push_back(rrq);
     off_node_bytes+=bytes;
   }
 
   if ( gdest == MPI_UNDEFINED ) {
-    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,_processor,communicator_halo[commdir],&xrq);
+    tag= dir+_processor*32;
+    ierr =MPI_Isend(xmit, bytes, MPI_CHAR,dest,tag,communicator_halo[commdir],&xrq);
     assert(ierr==0);
     list.push_back(xrq);
     off_node_bytes+=bytes;
 
@@ -457,8 +457,9 @@ void GlobalSharedMemory::SharedMemoryAllocate(uint64_t bytes, int flags)
     std::cerr << " SharedMemoryMPI.cc acceleratorAllocDevice failed NULL pointer for " << bytes<<" bytes " << std::endl;
     exit(EXIT_FAILURE);  
   }
-  if ( WorldRank == 0 ){
-    std::cout << header " SharedMemoryMPI.cc cudaMalloc "<< bytes 
+  // Previously guarded by ( WorldRank == 0 ); now every rank reports its own allocation
+  if ( 1 ){
+    std::cout << WorldRank << header " SharedMemoryMPI.cc acceleratorAllocDevice "<< bytes 
 	      << "bytes at "<< std::hex<< ShmCommBuf <<std::dec<<" for comms buffers " <<std::endl;
   }
   SharedMemoryZero(ShmCommBuf,bytes);
@@ -771,20 +772,11 @@ void SharedMemory::SetCommunicator(Grid_MPI_Comm comm)
   std::vector<int> ranks(size);   for(int r=0;r<size;r++) ranks[r]=r;
   MPI_Group_translate_ranks (FullGroup,size,&ranks[0],ShmGroup, &ShmRanks[0]); 
 
-#ifdef GRID_IBM_SUMMIT
-  // Hide the shared memory path between sockets 
-  // if even number of nodes
-  if ( (ShmSize & 0x1)==0 ) {
-    int SocketSize = ShmSize/2;
-    int mySocket = ShmRank/SocketSize; 
+#ifdef GRID_SHM_DISABLE
+  // Hide the shared memory path between ranks
+  {
     for(int r=0;r<size;r++){
-      int hisRank=ShmRanks[r];
-      if ( hisRank!= MPI_UNDEFINED ) {
-	int hisSocket=hisRank/SocketSize;
-	if ( hisSocket != mySocket ) {
-	  ShmRanks[r] = MPI_UNDEFINED;
-	}
-      }
+      ShmRanks[r] = MPI_UNDEFINED;
     }
   }
 #endif
 
@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
 // Gather for when there is no need to SIMD split 
 ///////////////////////////////////////////////////////////////////
 template<class vobj> void 
-Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
+Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector<vobj> &buffer,int dimension,int plane,int cbmask, int off=0)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
 
@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
      }
   }
   {
-    autoView(rhs_v , rhs, AcceleratorRead);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(buffer_p[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for(i,ent,{
+      buffer_p[table[i].first]=rhs_v[table[i].second];
+    });
+#endif
   }
 }
 
@@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
   int n1=rhs.Grid()->_slice_stride[dimension];
 
   if ( cbmask ==0x3){
+#ifdef ACCELERATOR_CSHIFT    
     autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for2d(n,e1,b,e2,1,{
 	int o      =   n*n1;
@@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	vobj temp =rhs_v[so+o+b];
 	extract<vobj>(temp,pointers,offset);
       });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+	int o      =   n*n1;
+	int offset = b+n*e2;
+	
+	vobj temp =rhs_v[so+o+b];
+	extract<vobj>(temp,pointers,offset);
+      });
+#endif
   } else { 
-    autoView(rhs_v , rhs, AcceleratorRead);
-
     Coordinate rdim=rhs.Grid()->_rdimensions;
     Coordinate cdm =rhs.Grid()->_checker_dim_mask;
     std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
+#ifdef ACCELERATOR_CSHIFT    
+    autoView(rhs_v , rhs, AcceleratorRead);
     accelerator_for2d(n,e1,b,e2,1,{
 
 	Coordinate coor;
@@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
 	  extract<vobj>(temp,pointers,offset);
 	}
       });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    thread_for2d(n,e1,b,e2,{
+
+	Coordinate coor;
+
+	int o=n*n1;
+	int oindex = o+b;
+
+       	int cb = RedBlackCheckerBoardFromOindex(oindex, rdim, cdm);
+
+	int ocb=1<<cb;
+	int offset = b+n*e2;
+
+	if ( ocb & cbmask ) {
+	  vobj temp =rhs_v[so+o+b];
+	  extract<vobj>(temp,pointers,offset);
+	}
+      });
+#endif
   }
 }
 
 //////////////////////////////////////////////////////
 // Scatter for when there is no need to SIMD split
 //////////////////////////////////////////////////////
-template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vobj> &buffer, int dimension,int plane,int cbmask)
+template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector<vobj> &buffer, int dimension,int plane,int cbmask)
 {
   int rd = rhs.Grid()->_rdimensions[dimension];
 
@@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
   }
 
   {
-    autoView( rhs_v, rhs, AcceleratorWrite);
     auto buffer_p = & buffer[0];
     auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView( rhs_v, rhs, AcceleratorWrite);
     accelerator_for(i,ent,vobj::Nsimd(),{
 	coalescedWrite(rhs_v[table[i].first],coalescedRead(buffer_p[table[i].second]));
     });
+#else
+    autoView( rhs_v, rhs, CpuWrite);
+    thread_for(i,ent,{
+      rhs_v[table[i].first]=buffer_p[table[i].second];
+    });
+#endif
   }
 }
 
@@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
   int e2=rhs.Grid()->_slice_block[dimension];
 
   if(cbmask ==0x3 ) {
-    autoView( rhs_v , rhs, AcceleratorWrite);
     int _slice_stride = rhs.Grid()->_slice_stride[dimension];
     int _slice_block = rhs.Grid()->_slice_block[dimension];
+#ifdef ACCELERATOR_CSHIFT    
+    autoView( rhs_v , rhs, AcceleratorWrite);
     accelerator_for2d(n,e1,b,e2,1,{
 	int o      = n*_slice_stride;
 	int offset = b+n*_slice_block;
 	merge(rhs_v[so+o+b],pointers,offset);
       });
+#else
+    autoView( rhs_v , rhs, CpuWrite);
+    thread_for2d(n,e1,b,e2,{
+	int o      = n*_slice_stride;
+	int offset = b+n*_slice_block;
+	merge(rhs_v[so+o+b],pointers,offset);
+    });
+#endif
   } else { 
 
     // Case of SIMD split AND checker dim cannot currently be hit, except in 
@@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
   }
 
   {
+    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
     autoView(rhs_v , rhs, AcceleratorRead);
     autoView(lhs_v , lhs, AcceleratorWrite);
-    auto table = &Cshift_table[0];
     accelerator_for(i,ent,vobj::Nsimd(),{
       coalescedWrite(lhs_v[table[i].first],coalescedRead(rhs_v[table[i].second]));
     });
+#else
+    autoView(rhs_v , rhs, CpuRead);
+    autoView(lhs_v , lhs, CpuWrite);
+    thread_for(i,ent,{
+      lhs_v[table[i].first]=rhs_v[table[i].second];
+    });
+#endif
   }
 }
 
@@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
   }
 
   {
+    auto table = &Cshift_table[0];
+#ifdef ACCELERATOR_CSHIFT    
     autoView( rhs_v, rhs, AcceleratorRead);
     autoView( lhs_v, lhs, AcceleratorWrite);
-    auto table = &Cshift_table[0];
     accelerator_for(i,ent,1,{
       permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
     });
+#else
+    autoView( rhs_v, rhs, CpuRead);
+    autoView( lhs_v, lhs, CpuWrite);
+    thread_for(i,ent,{
+      permute(lhs_v[table[i].first],rhs_v[table[i].second],permute_type);
+    });
+#endif
   }
 }