@@ -236,6 +236,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
236236 dim3 grid_1 (number_of_blocks_1) ;
237237 dim3 block (threads_per_block) ;
238238
239+ CUDA_OK (cudaGetLastError ( )) ;
240+ CUDA_OK (cudaStreamSynchronize (stream)) ;
241+
239242 // --------------------------------------------------------------------------
240243 // C<M>=A'*B via jitified kernels
241244 // --------------------------------------------------------------------------
@@ -265,8 +268,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
265268 // kernel_timer.Start();
266269 GB_cuda_AxB_dot3_dense_phase1_kernel <<<grid_1, block, 0 , stream>>>
267270 (C, M) ;
268-
269- CUDA_OK (cudaStreamSynchronize (stream)) ; // is this needed?
271+ CUDA_OK ( cudaGetLastError ( )) ;
272+ CUDA_OK (cudaStreamSynchronize (stream)) ;
270273
271274 // kernel_timer.Stop();
272275 // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
@@ -364,7 +367,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
364367 // printf ("\nLaunching sparse phase1:\n") ;
365368 GB_jit_AxB_dot3_phase1_kernel <<<grid_1, block, 0 , stream>>>
366369 (Nanobuckets, Blockbucket, C, M, A, B) ;
367-
370+ CUDA_OK ( cudaGetLastError ( )) ;
368371 CUDA_OK (cudaStreamSynchronize (stream)) ;
369372
370373 // kernel_timer.Stop();
@@ -385,7 +388,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
385388 // printf ("Launching sparse phase2:\n") ;
386389 GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0 , stream>>>
387390 (Blockbucket, offset, number_of_blocks_1) ;
388-
391+ CUDA_OK ( cudaGetLastError ( )) ;
389392 CUDA_OK (cudaStreamSynchronize (stream)) ;
390393
391394 int64_t s = offset [0 ] ;
@@ -424,8 +427,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
424427 // printf ("Launching sparse phase2end:\n") ;
425428 GB_cuda_AxB_dot3_phase2end_kernel <<<grid_1, block, 0 , stream>>>
426429 (Nanobuckets, Blockbucket, Bucketp, Bucket, offset, C, mnz) ;
427-
430+ CUDA_OK ( cudaGetLastError ( )) ;
428431 CUDA_OK (cudaStreamSynchronize (stream)) ;
432+
429433 // kernel_timer.Stop();
430434 // printf ("(GPU phase2end %12.6g ms)\n",kernel_timer.Elapsed());
431435 }
@@ -472,6 +476,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
472476 GB_cuda_AxB_dot3_phase3_vsvs_kernel
473477 <<<grid_3, block, 0 , stream>>>
474478 (start, end, Bucket, C, M, A, B, theta) ;
479+ CUDA_OK (cudaGetLastError ( )) ;
480+ CUDA_OK (cudaStreamSynchronize (stream)) ;
475481 }
476482 break ;
477483
@@ -504,6 +510,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
504510 GB_cuda_AxB_dot3_phase3_mp_kernel
505511 <<<grid_3, block, shared_bytes, stream>>>
506512 (start, end, Bucket, C, M, A, B, theta) ;
513+ CUDA_OK (cudaGetLastError ( )) ;
514+ CUDA_OK (cudaStreamSynchronize (stream)) ;
507515 }
508516 break ;
509517
@@ -531,6 +539,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
531539 GB_cuda_AxB_dot3_phase3_vssp_kernel
532540 <<<grid_3, block, 0 , stream>>>
533541 (start, end, Bucket, C, M, A, B, theta) ;
542+ CUDA_OK (cudaGetLastError ( )) ;
543+ CUDA_OK (cudaStreamSynchronize (stream)) ;
534544 }
535545 break ;
536546
@@ -561,6 +571,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
561571 GB_cuda_AxB_dot3_phase3_vsdn_kernel
562572 <<<grid_3, block, 0 , stream>>>
563573 (start, end, Bucket, C, M, A, B, theta) ;
574+ CUDA_OK (cudaGetLastError ( )) ;
575+ CUDA_OK (cudaStreamSynchronize (stream)) ;
564576 }
565577 break ;
566578
@@ -588,6 +600,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
588600 GB_cuda_AxB_dot3_phase3_spdn_kernel
589601 <<<grid_3, block, 0 , stream>>>
590602 (start, end, Bucket, C, M, A, B, theta) ;
603+ CUDA_OK (cudaGetLastError ( )) ;
604+ CUDA_OK (cudaStreamSynchronize (stream)) ;
591605 break ;
592606 }
593607 }
0 commit comments