@@ -35,7 +35,7 @@ extern Vector<std::pair<int,int> > Cshift_table;
3535// Gather for when there is no need to SIMD split
3636// /////////////////////////////////////////////////////////////////
3737template <class vobj > void
38- Gather_plane_simple (const Lattice<vobj> &rhs,commVector <vobj> &buffer,int dimension,int plane,int cbmask, int off=0 )
38+ Gather_plane_simple (const Lattice<vobj> &rhs,cshiftVector <vobj> &buffer,int dimension,int plane,int cbmask, int off=0 )
3939{
4040 int rd = rhs.Grid ()->_rdimensions [dimension];
4141
@@ -73,12 +73,19 @@ Gather_plane_simple (const Lattice<vobj> &rhs,commVector<vobj> &buffer,int dimen
7373 }
7474 }
7575 {
76- autoView (rhs_v , rhs, AcceleratorRead);
7776 auto buffer_p = & buffer[0 ];
7877 auto table = &Cshift_table[0 ];
78+ #ifdef ACCELERATOR_CSHIFT
79+ autoView (rhs_v , rhs, AcceleratorRead);
7980 accelerator_for (i,ent,vobj::Nsimd (),{
8081 coalescedWrite (buffer_p[table[i].first ],coalescedRead (rhs_v[table[i].second ]));
8182 });
83+ #else
84+ autoView (rhs_v , rhs, CpuRead);
85+ thread_for (i,ent,{
86+ buffer_p[table[i].first ]=rhs_v[table[i].second ];
87+ });
88+ #endif
8289 }
8390}
8491
@@ -103,6 +110,7 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
103110 int n1=rhs.Grid ()->_slice_stride [dimension];
104111
105112 if ( cbmask ==0x3 ){
113+ #ifdef ACCELERATOR_CSHIFT
106114 autoView (rhs_v , rhs, AcceleratorRead);
107115 accelerator_for2d (n,e1 ,b,e2 ,1 ,{
108116 int o = n*n1;
@@ -111,12 +119,22 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
111119 vobj temp =rhs_v[so+o+b];
112120 extract<vobj>(temp,pointers,offset);
113121 });
122+ #else
123+ autoView (rhs_v , rhs, CpuRead);
124+ thread_for2d (n,e1 ,b,e2 ,{
125+ int o = n*n1;
126+ int offset = b+n*e2 ;
127+
128+ vobj temp =rhs_v[so+o+b];
129+ extract<vobj>(temp,pointers,offset);
130+ });
131+ #endif
114132 } else {
115- autoView (rhs_v , rhs, AcceleratorRead);
116-
117133 Coordinate rdim=rhs.Grid ()->_rdimensions ;
118134 Coordinate cdm =rhs.Grid ()->_checker_dim_mask ;
119135 std::cout << " Dense packed buffer WARNING " <<std::endl; // Does this get called twice once for each cb?
136+ #ifdef ACCELERATOR_CSHIFT
137+ autoView (rhs_v , rhs, AcceleratorRead);
120138 accelerator_for2d (n,e1 ,b,e2 ,1 ,{
121139
122140 Coordinate coor;
@@ -134,13 +152,33 @@ Gather_plane_extract(const Lattice<vobj> &rhs,
134152 extract<vobj>(temp,pointers,offset);
135153 }
136154 });
155+ #else
156+ autoView (rhs_v , rhs, CpuRead);
157+ thread_for2d (n,e1 ,b,e2 ,{
158+
159+ Coordinate coor;
160+
161+ int o=n*n1;
162+ int oindex = o+b;
163+
164+ int cb = RedBlackCheckerBoardFromOindex (oindex, rdim, cdm);
165+
166+ int ocb=1 <<cb;
167+ int offset = b+n*e2 ;
168+
169+ if ( ocb & cbmask ) {
170+ vobj temp =rhs_v[so+o+b];
171+ extract<vobj>(temp,pointers,offset);
172+ }
173+ });
174+ #endif
137175 }
138176}
139177
140178// ////////////////////////////////////////////////////
141179// Scatter for when there is no need to SIMD split
142180// ////////////////////////////////////////////////////
143- template <class vobj > void Scatter_plane_simple (Lattice<vobj> &rhs,commVector <vobj> &buffer, int dimension,int plane,int cbmask)
181+ template <class vobj > void Scatter_plane_simple (Lattice<vobj> &rhs,cshiftVector <vobj> &buffer, int dimension,int plane,int cbmask)
144182{
145183 int rd = rhs.Grid ()->_rdimensions [dimension];
146184
@@ -182,12 +220,19 @@ template<class vobj> void Scatter_plane_simple (Lattice<vobj> &rhs,commVector<vo
182220 }
183221
184222 {
185- autoView ( rhs_v, rhs, AcceleratorWrite);
186223 auto buffer_p = & buffer[0 ];
187224 auto table = &Cshift_table[0 ];
225+ #ifdef ACCELERATOR_CSHIFT
226+ autoView ( rhs_v, rhs, AcceleratorWrite);
188227 accelerator_for (i,ent,vobj::Nsimd (),{
189228 coalescedWrite (rhs_v[table[i].first ],coalescedRead (buffer_p[table[i].second ]));
190229 });
230+ #else
231+ autoView ( rhs_v, rhs, CpuWrite);
232+ thread_for (i,ent,{
233+ rhs_v[table[i].first ]=buffer_p[table[i].second ];
234+ });
235+ #endif
191236 }
192237}
193238
@@ -208,14 +253,23 @@ template<class vobj> void Scatter_plane_merge(Lattice<vobj> &rhs,ExtractPointerA
208253 int e2 =rhs.Grid ()->_slice_block [dimension];
209254
210255 if (cbmask ==0x3 ) {
211- autoView ( rhs_v , rhs, AcceleratorWrite);
212256 int _slice_stride = rhs.Grid ()->_slice_stride [dimension];
213257 int _slice_block = rhs.Grid ()->_slice_block [dimension];
258+ #ifdef ACCELERATOR_CSHIFT
259+ autoView ( rhs_v , rhs, AcceleratorWrite);
214260 accelerator_for2d (n,e1 ,b,e2 ,1 ,{
215261 int o = n*_slice_stride;
216262 int offset = b+n*_slice_block;
217263 merge (rhs_v[so+o+b],pointers,offset);
218264 });
265+ #else
266+ autoView ( rhs_v , rhs, CpuWrite);
267+ thread_for2d (n,e1 ,b,e2 ,{
268+ int o = n*_slice_stride;
269+ int offset = b+n*_slice_block;
270+ merge (rhs_v[so+o+b],pointers,offset);
271+ });
272+ #endif
219273 } else {
220274
221275 // Case of SIMD split AND checker dim cannot currently be hit, except in
@@ -280,12 +334,20 @@ template<class vobj> void Copy_plane(Lattice<vobj>& lhs,const Lattice<vobj> &rhs
280334 }
281335
282336 {
337+ auto table = &Cshift_table[0 ];
338+ #ifdef ACCELERATOR_CSHIFT
283339 autoView (rhs_v , rhs, AcceleratorRead);
284340 autoView (lhs_v , lhs, AcceleratorWrite);
285- auto table = &Cshift_table[0 ];
286341 accelerator_for (i,ent,vobj::Nsimd (),{
287342 coalescedWrite (lhs_v[table[i].first ],coalescedRead (rhs_v[table[i].second ]));
288343 });
344+ #else
345+ autoView (rhs_v , rhs, CpuRead);
346+ autoView (lhs_v , lhs, CpuWrite);
347+ thread_for (i,ent,{
348+ lhs_v[table[i].first ]=rhs_v[table[i].second ];
349+ });
350+ #endif
289351 }
290352}
291353
@@ -324,12 +386,20 @@ template<class vobj> void Copy_plane_permute(Lattice<vobj>& lhs,const Lattice<vo
324386 }
325387
326388 {
389+ auto table = &Cshift_table[0 ];
390+ #ifdef ACCELERATOR_CSHIFT
327391 autoView ( rhs_v, rhs, AcceleratorRead);
328392 autoView ( lhs_v, lhs, AcceleratorWrite);
329- auto table = &Cshift_table[0 ];
330393 accelerator_for (i,ent,1 ,{
331394 permute (lhs_v[table[i].first ],rhs_v[table[i].second ],permute_type);
332395 });
396+ #else
397+ autoView ( rhs_v, rhs, CpuRead);
398+ autoView ( lhs_v, lhs, CpuWrite);
399+ thread_for (i,ent,{
400+ permute (lhs_v[table[i].first ],rhs_v[table[i].second ],permute_type);
401+ });
402+ #endif
333403 }
334404}
335405
0 commit comments