1919template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2020inline __device__ void ContractX2dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
2121 CeedScalar *V) {
22+ __syncthreads ();
2223 data.slice [t_id_x + t_id_y * T_1D] = *U;
2324 __syncthreads ();
2425 *V = 0.0 ;
@@ -27,7 +28,6 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
2728 *V += B[i + t_id_x * P_1D] * data.slice [i + t_id_y * T_1D]; // Contract x direction
2829 }
2930 }
30- __syncthreads ();
3131}
3232
3333// ------------------------------------------------------------------------------
@@ -36,6 +36,7 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
3636template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3737inline __device__ void ContractY2dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
3838 CeedScalar *V) {
39+ __syncthreads ();
3940 data.slice [t_id_x + t_id_y * T_1D] = *U;
4041 __syncthreads ();
4142 *V = 0.0 ;
@@ -44,7 +45,6 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
4445 *V += B[i + t_id_y * P_1D] * data.slice [t_id_x + i * T_1D]; // Contract y direction
4546 }
4647 }
47- __syncthreads ();
4848}
4949
5050// ------------------------------------------------------------------------------
@@ -53,6 +53,7 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
5353template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5454inline __device__ void ContractTransposeY2dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
5555 const CeedScalar *B, CeedScalar *V) {
56+ __syncthreads ();
5657 data.slice [t_id_x + t_id_y * T_1D] = *U;
5758 __syncthreads ();
5859 *V = 0.0 ;
@@ -61,7 +62,6 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
6162 *V += B[t_id_y + i * P_1D] * data.slice [t_id_x + i * T_1D]; // Contract y direction
6263 }
6364 }
64- __syncthreads ();
6565}
6666
6767// ------------------------------------------------------------------------------
@@ -70,6 +70,7 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
7070template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
7171inline __device__ void ContractTransposeX2dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
7272 const CeedScalar *B, CeedScalar *V) {
73+ __syncthreads ();
7374 data.slice [t_id_x + t_id_y * T_1D] = *U;
7475 __syncthreads ();
7576 *V = 0.0 ;
@@ -78,7 +79,6 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
7879 *V += B[t_id_x + i * P_1D] * data.slice [i + t_id_y * T_1D]; // Contract x direction
7980 }
8081 }
81- __syncthreads ();
8282}
8383
8484// ------------------------------------------------------------------------------
@@ -87,14 +87,14 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
8787template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
8888inline __device__ void ContractTransposeAddX2dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
8989 const CeedScalar *B, CeedScalar *V) {
90+ __syncthreads ();
9091 data.slice [t_id_x + t_id_y * T_1D] = *U;
9192 __syncthreads ();
9293 if (t_id_x < P_1D && t_id_y < P_1D) {
9394 for (CeedInt i = 0 ; i < Q_1D; i++) {
9495 *V += B[t_id_x + i * P_1D] * data.slice [i + t_id_y * T_1D]; // Contract x direction
9596 }
9697 }
97- __syncthreads ();
9898}
9999
100100// ------------------------------------------------------------------------------
@@ -105,10 +105,10 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in
105105 const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106106
107107 for (CeedInt comp = 0 ; comp < NUM_COMP; comp++) {
108+ __syncthreads ();
108109 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice [t_id_x + t_id_y * T_1D] = U[comp];
109110 __syncthreads ();
110111 U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice [new_t_id_x + new_t_id_y * T_1D] : 0.0 ;
111- __syncthreads ();
112112 }
113113}
114114
@@ -117,10 +117,10 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const
117117 const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118118
119119 for (CeedInt comp = 0 ; comp < NUM_COMP; comp++) {
120+ __syncthreads ();
120121 if (data.t_id_x < (Q_1D * Q_1D)) data.slice [old_t_id_x + old_t_id_y * T_1D] = U[comp];
121122 __syncthreads ();
122123 U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice [t_id_x + t_id_y * T_1D] : 0.0 ;
123- __syncthreads ();
124124 }
125125}
126126
@@ -218,6 +218,7 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed
218218template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219219inline __device__ void ContractX3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220220 const CeedScalar *B, CeedScalar *V) {
221+ __syncthreads ();
221222 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
222223 __syncthreads ();
223224 *V = 0.0 ;
@@ -226,7 +227,6 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
226227 *V += B[i + t_id_x * P_1D] * data.slice [i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
227228 }
228229 }
229- __syncthreads ();
230230}
231231
232232// ------------------------------------------------------------------------------
@@ -235,6 +235,7 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
235235template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236236inline __device__ void ContractY3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237237 const CeedScalar *B, CeedScalar *V) {
238+ __syncthreads ();
238239 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
239240 __syncthreads ();
240241 *V = 0.0 ;
@@ -243,7 +244,6 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
243244 *V += B[i + t_id_y * P_1D] * data.slice [t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
244245 }
245246 }
246- __syncthreads ();
247247}
248248
249249// ------------------------------------------------------------------------------
@@ -252,6 +252,7 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
252252template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253253inline __device__ void ContractZ3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254254 const CeedScalar *B, CeedScalar *V) {
255+ __syncthreads ();
255256 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
256257 __syncthreads ();
257258 *V = 0.0 ;
@@ -260,7 +261,6 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
260261 *V += B[i + t_id_z * P_1D] * data.slice [t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
261262 }
262263 }
263- __syncthreads ();
264264}
265265
266266// ------------------------------------------------------------------------------
@@ -269,6 +269,7 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
269269template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270270inline __device__ void ContractTransposeZ3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271271 const CeedScalar *B, CeedScalar *V) {
272+ __syncthreads ();
272273 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
273274 __syncthreads ();
274275 *V = 0.0 ;
@@ -277,7 +278,6 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
277278 *V += B[t_id_z + i * P_1D] * data.slice [t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
278279 }
279280 }
280- __syncthreads ();
281281}
282282
283283// ------------------------------------------------------------------------------
@@ -286,14 +286,14 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
286286template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287287inline __device__ void ContractTransposeAddZ3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
288288 const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289+ __syncthreads ();
289290 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
290291 __syncthreads ();
291292 if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
292293 for (CeedInt i = 0 ; i < Q_1D; i++) {
293294 *V += B[t_id_z + i * P_1D] * data.slice [t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
294295 }
295296 }
296- __syncthreads ();
297297}
298298
299299// ------------------------------------------------------------------------------
@@ -302,6 +302,7 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c
302302template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303303inline __device__ void ContractTransposeY3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304304 const CeedScalar *B, CeedScalar *V) {
305+ __syncthreads ();
305306 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
306307 __syncthreads ();
307308 *V = 0.0 ;
@@ -310,7 +311,6 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
310311 *V += B[t_id_y + i * P_1D] * data.slice [t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
311312 }
312313 }
313- __syncthreads ();
314314}
315315
316316// ------------------------------------------------------------------------------
@@ -319,14 +319,14 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
319319template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320320inline __device__ void ContractTransposeAddY3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
321321 const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322+ __syncthreads ();
322323 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
323324 __syncthreads ();
324325 if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
325326 for (CeedInt i = 0 ; i < Q_1D; i++) {
326327 *V += B[t_id_y + i * P_1D] * data.slice [t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
327328 }
328329 }
329- __syncthreads ();
330330}
331331
332332// ------------------------------------------------------------------------------
@@ -335,6 +335,7 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c
335335template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336336inline __device__ void ContractTransposeX3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337337 const CeedScalar *B, CeedScalar *V) {
338+ __syncthreads ();
338339 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
339340 __syncthreads ();
340341 *V = 0.0 ;
@@ -343,7 +344,6 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
343344 *V += B[t_id_x + i * P_1D] * data.slice [i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
344345 }
345346 }
346- __syncthreads ();
347347}
348348
349349// ------------------------------------------------------------------------------
@@ -352,14 +352,14 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
352352template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353353inline __device__ void ContractTransposeAddX3dFlattened (SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
354354 const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355+ __syncthreads ();
355356 data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
356357 __syncthreads ();
357358 if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
358359 for (CeedInt i = 0 ; i < Q_1D; i++) {
359360 *V += B[t_id_x + i * P_1D] * data.slice [i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
360361 }
361362 }
362- __syncthreads ();
363363}
364364
365365// ------------------------------------------------------------------------------
@@ -370,10 +370,10 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in
370370 const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371371
372372 for (CeedInt comp = 0 ; comp < NUM_COMP; comp++) {
373+ __syncthreads ();
373374 if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
374375 __syncthreads ();
375376 U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice [new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0 ;
376- __syncthreads ();
377377 }
378378}
379379
@@ -382,10 +382,10 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const
382382 const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383383
384384 for (CeedInt comp = 0 ; comp < NUM_COMP; comp++) {
385+ __syncthreads ();
385386 if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice [old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
386387 __syncthreads ();
387388 U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice [t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0 ;
388- __syncthreads ();
389389 }
390390}
391391
0 commit comments