Skip to content

Commit d6c19ee

Browse files
authored
gpu - clarify __syncthreads usage (#1838)
1 parent 506c99b commit d6c19ee

10 files changed

+79
-76
lines changed

include/ceed/jit-source/cuda/cuda-gen-templates.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
274274
CeedScalar *__restrict__ r_V) {
275275
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
276276
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
277+
__syncthreads();
277278
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[q + comp * Q_1D];
278279
__syncthreads();
279280
// X derivative
@@ -291,7 +292,6 @@ inline __device__ void GradColloSlice3d(SharedData_Cuda &data, const CeedInt q,
291292
for (CeedInt i = 0; i < Q_1D; i++) {
292293
r_V[comp + 2 * NUM_COMP] += c_G[i + q * Q_1D] * r_U[i + comp * Q_1D];
293294
}
294-
__syncthreads();
295295
}
296296
}
297297
}
@@ -304,20 +304,20 @@ inline __device__ void GradColloSliceTranspose3d(SharedData_Cuda &data, const Ce
304304
CeedScalar *__restrict__ r_V) {
305305
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) {
306306
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
307+
__syncthreads();
307308
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 0 * NUM_COMP];
308309
__syncthreads();
309310
// X derivative
310311
for (CeedInt i = 0; i < Q_1D; i++) {
311312
r_V[q + comp * Q_1D] += c_G[data.t_id_x + i * Q_1D] * data.slice[i + data.t_id_y * T_1D];
312313
}
313-
__syncthreads();
314314
// Y derivative
315+
__syncthreads();
315316
data.slice[data.t_id_x + data.t_id_y * T_1D] = r_U[comp + 1 * NUM_COMP];
316317
__syncthreads();
317318
for (CeedInt i = 0; i < Q_1D; i++) {
318319
r_V[q + comp * Q_1D] += c_G[data.t_id_y + i * Q_1D] * data.slice[data.t_id_x + i * T_1D];
319320
}
320-
__syncthreads();
321321
// Z derivative
322322
for (CeedInt i = 0; i < Q_1D; i++) {
323323
r_V[i + comp * Q_1D] += c_G[i + q * Q_1D] * r_U[comp + 2 * NUM_COMP];

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points-templates.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ inline __device__ void GradAtPoints1d(SharedData_Cuda &data, const CeedInt p, co
9595
for (CeedInt i = 0; i < NUM_COMP; i++) r_V[i] = 0.0;
9696
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
9797
// Load coefficients
98+
__syncthreads();
9899
if (data.t_id_x < Q_1D) data.slice[data.t_id_x] = r_C[comp];
99100
__syncthreads();
100101
// Contract x direction
@@ -145,6 +146,7 @@ inline __device__ void InterpAtPoints2d(SharedData_Cuda &data, const CeedInt p,
145146
CeedScalar chebyshev_x[Q_1D];
146147

147148
// Load coefficients
149+
__syncthreads();
148150
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
149151
__syncthreads();
150152
// Contract x direction
@@ -213,6 +215,7 @@ inline __device__ void GradAtPoints2d(SharedData_Cuda &data, const CeedInt p, co
213215
CeedScalar chebyshev_x[Q_1D];
214216

215217
// Load coefficients
218+
__syncthreads();
216219
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[comp];
217220
__syncthreads();
218221
for (CeedInt dim = 0; dim < 2; dim++) {
@@ -294,6 +297,7 @@ inline __device__ void InterpAtPoints3d(SharedData_Cuda &data, const CeedInt p,
294297
CeedScalar chebyshev_x[Q_1D];
295298

296299
// Load coefficients
300+
__syncthreads();
297301
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
298302
__syncthreads();
299303
// Contract x direction
@@ -372,6 +376,7 @@ inline __device__ void GradAtPoints3d(SharedData_Cuda &data, const CeedInt p, co
372376
CeedScalar chebyshev_x[Q_1D];
373377

374378
// Load coefficients
379+
__syncthreads();
375380
if (data.t_id_x < Q_1D && data.t_id_y < Q_1D) data.slice[data.t_id_x + data.t_id_y * Q_1D] = r_C[k + comp * Q_1D];
376381
__syncthreads();
377382
for (CeedInt dim = 0; dim < 3; dim++) {

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-at-points.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,6 @@ extern "C" __global__ void InterpTransposeAtPoints(const CeedInt num_elem, const
129129
InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
130130
}
131131
}
132-
__syncthreads();
133132

134133
// Map from coefficients
135134
if (BASIS_DIM == 1) {
@@ -189,7 +188,6 @@ extern "C" __global__ void InterpTransposeAddAtPoints(const CeedInt num_elem, co
189188
InterpTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
190189
}
191190
}
192-
__syncthreads();
193191

194192
// Map from coefficients
195193
if (BASIS_DIM == 1) {
@@ -319,7 +317,6 @@ extern "C" __global__ void GradTransposeAtPoints(const CeedInt num_elem, const C
319317
GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
320318
}
321319
}
322-
__syncthreads();
323320

324321
// Map from coefficients
325322
if (BASIS_DIM == 1) {
@@ -380,7 +377,6 @@ extern "C" __global__ void GradTransposeAddAtPoints(const CeedInt num_elem, cons
380377
GradTransposeAtPoints3d<BASIS_NUM_COMP, BASIS_NUM_PTS, BASIS_P_1D, BASIS_Q_1D>(data, i, r_U, r_X, r_C);
381378
}
382379
}
383-
__syncthreads();
384380

385381
// Map from coefficients
386382
if (BASIS_DIM == 1) {

include/ceed/jit-source/cuda/cuda-shared-basis-tensor-flattened-templates.h

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
2020
inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
2121
CeedScalar *V) {
22+
__syncthreads();
2223
data.slice[t_id_x + t_id_y * T_1D] = *U;
2324
__syncthreads();
2425
*V = 0.0;
@@ -27,7 +28,6 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
2728
*V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
2829
}
2930
}
30-
__syncthreads();
3131
}
3232

3333
//------------------------------------------------------------------------------
@@ -36,6 +36,7 @@ inline __device__ void ContractX2dFlattened(SharedData_Cuda &data, const int t_i
3636
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
3737
inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U, const CeedScalar *B,
3838
CeedScalar *V) {
39+
__syncthreads();
3940
data.slice[t_id_x + t_id_y * T_1D] = *U;
4041
__syncthreads();
4142
*V = 0.0;
@@ -44,7 +45,6 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
4445
*V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction
4546
}
4647
}
47-
__syncthreads();
4848
}
4949

5050
//------------------------------------------------------------------------------
@@ -53,6 +53,7 @@ inline __device__ void ContractY2dFlattened(SharedData_Cuda &data, const int t_i
5353
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
5454
inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
5555
const CeedScalar *B, CeedScalar *V) {
56+
__syncthreads();
5657
data.slice[t_id_x + t_id_y * T_1D] = *U;
5758
__syncthreads();
5859
*V = 0.0;
@@ -61,7 +62,6 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
6162
*V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D]; // Contract y direction
6263
}
6364
}
64-
__syncthreads();
6565
}
6666

6767
//------------------------------------------------------------------------------
@@ -70,6 +70,7 @@ inline __device__ void ContractTransposeY2dFlattened(SharedData_Cuda &data, cons
7070
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
7171
inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
7272
const CeedScalar *B, CeedScalar *V) {
73+
__syncthreads();
7374
data.slice[t_id_x + t_id_y * T_1D] = *U;
7475
__syncthreads();
7576
*V = 0.0;
@@ -78,7 +79,6 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
7879
*V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
7980
}
8081
}
81-
__syncthreads();
8282
}
8383

8484
//------------------------------------------------------------------------------
@@ -87,14 +87,14 @@ inline __device__ void ContractTransposeX2dFlattened(SharedData_Cuda &data, cons
8787
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
8888
inline __device__ void ContractTransposeAddX2dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const CeedScalar *U,
8989
const CeedScalar *B, CeedScalar *V) {
90+
__syncthreads();
9091
data.slice[t_id_x + t_id_y * T_1D] = *U;
9192
__syncthreads();
9293
if (t_id_x < P_1D && t_id_y < P_1D) {
9394
for (CeedInt i = 0; i < Q_1D; i++) {
9495
*V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D]; // Contract x direction
9596
}
9697
}
97-
__syncthreads();
9898
}
9999

100100
//------------------------------------------------------------------------------
@@ -105,10 +105,10 @@ inline __device__ void QPack2d(SharedData_Cuda &data, const int t_id_x, const in
105105
const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = data.t_id_x / Q_1D;
106106

107107
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
108+
__syncthreads();
108109
if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D] = U[comp];
109110
__syncthreads();
110111
U[comp] = data.t_id_x < (Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D] : 0.0;
111-
__syncthreads();
112112
}
113113
}
114114

@@ -117,10 +117,10 @@ inline __device__ void QUnpack2d(SharedData_Cuda &data, const int t_id_x, const
117117
const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = data.t_id_x / Q_1D;
118118

119119
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
120+
__syncthreads();
120121
if (data.t_id_x < (Q_1D * Q_1D)) data.slice[old_t_id_x + old_t_id_y * T_1D] = U[comp];
121122
__syncthreads();
122123
U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D] : 0.0;
123-
__syncthreads();
124124
}
125125
}
126126

@@ -218,6 +218,7 @@ inline __device__ void WeightTensor2dFlattened(SharedData_Cuda &data, const Ceed
218218
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
219219
inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
220220
const CeedScalar *B, CeedScalar *V) {
221+
__syncthreads();
221222
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
222223
__syncthreads();
223224
*V = 0.0;
@@ -226,7 +227,6 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
226227
*V += B[i + t_id_x * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
227228
}
228229
}
229-
__syncthreads();
230230
}
231231

232232
//------------------------------------------------------------------------------
@@ -235,6 +235,7 @@ inline __device__ void ContractX3dFlattened(SharedData_Cuda &data, const int t_i
235235
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
236236
inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
237237
const CeedScalar *B, CeedScalar *V) {
238+
__syncthreads();
238239
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
239240
__syncthreads();
240241
*V = 0.0;
@@ -243,7 +244,6 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
243244
*V += B[i + t_id_y * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
244245
}
245246
}
246-
__syncthreads();
247247
}
248248

249249
//------------------------------------------------------------------------------
@@ -252,6 +252,7 @@ inline __device__ void ContractY3dFlattened(SharedData_Cuda &data, const int t_i
252252
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
253253
inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
254254
const CeedScalar *B, CeedScalar *V) {
255+
__syncthreads();
255256
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
256257
__syncthreads();
257258
*V = 0.0;
@@ -260,7 +261,6 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
260261
*V += B[i + t_id_z * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
261262
}
262263
}
263-
__syncthreads();
264264
}
265265

266266
//------------------------------------------------------------------------------
@@ -269,6 +269,7 @@ inline __device__ void ContractZ3dFlattened(SharedData_Cuda &data, const int t_i
269269
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
270270
inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
271271
const CeedScalar *B, CeedScalar *V) {
272+
__syncthreads();
272273
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
273274
__syncthreads();
274275
*V = 0.0;
@@ -277,7 +278,6 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
277278
*V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
278279
}
279280
}
280-
__syncthreads();
281281
}
282282

283283
//------------------------------------------------------------------------------
@@ -286,14 +286,14 @@ inline __device__ void ContractTransposeZ3dFlattened(SharedData_Cuda &data, cons
286286
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
287287
inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
288288
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
289+
__syncthreads();
289290
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
290291
__syncthreads();
291292
if (t_id_x < Q_1D && t_id_y < Q_1D && t_id_z < P_1D) {
292293
for (CeedInt i = 0; i < Q_1D; i++) {
293294
*V += B[t_id_z + i * P_1D] * data.slice[t_id_x + t_id_y * T_1D + i * T_1D * T_1D]; // Contract z direction
294295
}
295296
}
296-
__syncthreads();
297297
}
298298

299299
//------------------------------------------------------------------------------
@@ -302,6 +302,7 @@ inline __device__ void ContractTransposeAddZ3dFlattened(SharedData_Cuda &data, c
302302
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
303303
inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
304304
const CeedScalar *B, CeedScalar *V) {
305+
__syncthreads();
305306
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
306307
__syncthreads();
307308
*V = 0.0;
@@ -310,7 +311,6 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
310311
*V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
311312
}
312313
}
313-
__syncthreads();
314314
}
315315

316316
//------------------------------------------------------------------------------
@@ -319,14 +319,14 @@ inline __device__ void ContractTransposeY3dFlattened(SharedData_Cuda &data, cons
319319
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
320320
inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
321321
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
322+
__syncthreads();
322323
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
323324
__syncthreads();
324325
if (t_id_x < Q_1D && t_id_y < P_1D && t_id_z < P_1D) {
325326
for (CeedInt i = 0; i < Q_1D; i++) {
326327
*V += B[t_id_y + i * P_1D] * data.slice[t_id_x + i * T_1D + t_id_z * T_1D * T_1D]; // Contract y direction
327328
}
328329
}
329-
__syncthreads();
330330
}
331331

332332
//------------------------------------------------------------------------------
@@ -335,6 +335,7 @@ inline __device__ void ContractTransposeAddY3dFlattened(SharedData_Cuda &data, c
335335
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
336336
inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z, const CeedScalar *U,
337337
const CeedScalar *B, CeedScalar *V) {
338+
__syncthreads();
338339
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
339340
__syncthreads();
340341
*V = 0.0;
@@ -343,7 +344,6 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
343344
*V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
344345
}
345346
}
346-
__syncthreads();
347347
}
348348

349349
//------------------------------------------------------------------------------
@@ -352,14 +352,14 @@ inline __device__ void ContractTransposeX3dFlattened(SharedData_Cuda &data, cons
352352
template <int NUM_COMP, int P_1D, int Q_1D, int T_1D>
353353
inline __device__ void ContractTransposeAddX3dFlattened(SharedData_Cuda &data, const int t_id_x, const int t_id_y, const int t_id_z,
354354
const CeedScalar *U, const CeedScalar *B, CeedScalar *V) {
355+
__syncthreads();
355356
data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = *U;
356357
__syncthreads();
357358
if (t_id_x < P_1D && t_id_y < P_1D && t_id_z < P_1D) {
358359
for (CeedInt i = 0; i < Q_1D; i++) {
359360
*V += B[t_id_x + i * P_1D] * data.slice[i + t_id_y * T_1D + t_id_z * T_1D * T_1D]; // Contract x direction
360361
}
361362
}
362-
__syncthreads();
363363
}
364364

365365
//------------------------------------------------------------------------------
@@ -370,10 +370,10 @@ inline __device__ void QPack3d(SharedData_Cuda &data, const int t_id_x, const in
370370
const CeedInt new_t_id_x = data.t_id_x % Q_1D, new_t_id_y = (data.t_id_x / Q_1D) % Q_1D, new_t_id_z = data.t_id_x / (Q_1D * Q_1D);
371371

372372
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
373+
__syncthreads();
373374
if (t_id_x < Q_1D && t_id_y < Q_1D) data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] = U[comp];
374375
__syncthreads();
375376
U[comp] = data.t_id_x < (Q_1D * Q_1D * Q_1D) ? data.slice[new_t_id_x + new_t_id_y * T_1D + new_t_id_z * T_1D * T_1D] : 0.0;
376-
__syncthreads();
377377
}
378378
}
379379

@@ -382,10 +382,10 @@ inline __device__ void QUnpack3d(SharedData_Cuda &data, const int t_id_x, const
382382
const CeedInt old_t_id_x = data.t_id_x % Q_1D, old_t_id_y = (data.t_id_x / Q_1D) % Q_1D, old_t_id_z = data.t_id_x / (Q_1D * Q_1D);
383383

384384
for (CeedInt comp = 0; comp < NUM_COMP; comp++) {
385+
__syncthreads();
385386
if (data.t_id_x < Q_1D * Q_1D * Q_1D) data.slice[old_t_id_x + old_t_id_y * T_1D + old_t_id_z * T_1D * T_1D] = U[comp];
386387
__syncthreads();
387388
U[comp] = (t_id_x < Q_1D && t_id_y < Q_1D) ? data.slice[t_id_x + t_id_y * T_1D + t_id_z * T_1D * T_1D] : 0.0;
388-
__syncthreads();
389389
}
390390
}
391391

0 commit comments

Comments
 (0)