DrTimothyAldenDavis
diff --git a/CUDA/GB_cuda_apply_unop.cpp b/CUDA/GB_cuda_apply_unop.cpp
Lines changed: 3 additions & 2 deletions
diff --git a/CUDA/GB_cuda_select_sparse.cpp b/CUDA/GB_cuda_select_sparse.cpp
Lines changed: 6 additions & 0 deletions
diff --git a/CUDA/template/GB_cuda_ek_slice.cuh b/CUDA/template/GB_cuda_ek_slice.cuh
Lines changed: 24 additions & 11 deletions
diff --git a/CUDA/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh b/CUDA/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh
Lines changed: 2 additions & 2 deletions
diff --git a/CUDA/template/GB_cuda_jit_AxB_dot3_phase1.cuh b/CUDA/template/GB_cuda_jit_AxB_dot3_phase1.cuh
Lines changed: 2 additions & 2 deletions
diff --git a/CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu b/CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu
Lines changed: 19 additions & 5 deletions
diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu
Lines changed: 8 additions & 0 deletions
diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu b/CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu
Lines changed: 9 additions & 1 deletion
diff --git a/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu b/CUDA/template/GB_jit_kernel_cuda_apply_unop.cu
Lines changed: 16 additions & 6 deletions
@@ -33,6 +33,9 @@ GrB_Info GB_cuda_apply_unop
     GB_void *ythunk_cuda = NULL ;
     size_t ythunk_cuda_size = 0 ;
 
+    GrB_Index anz = GB_nnz_held (A) ;
+    if (anz == 0) return (GrB_SUCCESS) ;
+
     // FIXME: use the stream pool
     cudaStream_t stream = nullptr ;
     CUDA_OK (cudaStreamCreate (&stream)) ;
@@ -52,8 +55,6 @@ GrB_Info GB_cuda_apply_unop
         memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
     }
 
-    GrB_Index anz = GB_nnz_held (A) ;
-
     int32_t number_of_sms = GB_Global_gpu_sm_get (0) ;
     int64_t raw_gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;
     // cap #of blocks to 256 * #of sms
 
@@ -68,6 +68,12 @@ GrB_Info GB_cuda_select_sparse
     C->jumbled = A->jumbled ;
     C->iso = C_iso ;
 
+    CUDA_OK (cudaGetLastError ( )) ;    //FIXME: remove
+    CUDA_OK (cudaStreamSynchronize (stream)) ;  //FIXME: remove
+    CUDA_OK (cudaGetLastError ( )) ;    //FIXME: remove
+    CUDA_OK (cudaStreamSynchronize (stream)) ;  //FIXME: remove
+    CUDA_OK (cudaGetLastError ( )) ;    //FIXME: remove
+
     GB_OK (GB_cuda_select_sparse_jit (C, A,
         flipij, ythunk, op, stream, gridsz, BLOCK_SIZE)) ;
 
 
@@ -66,10 +66,10 @@
 // GB_cuda_ek_slice_setup
 //------------------------------------------------------------------------------
 
-static __device__ __inline__ void GB_cuda_ek_slice_setup
+template <typename T> __device__ void GB_cuda_ek_slice_setup
 (
     // inputs, not modified:
-    const GB_Ap_TYPE *Ap,       // array of size anvec+1
+    const T *Ap,                // array of size anvec+1
     const int64_t anvec,        // # of vectors in the matrix A
     const int64_t anz,          // # of entries in the sparse/hyper matrix A
     const int64_t pfirst,       // first entry in A to find k
@@ -107,16 +107,29 @@ static __device__ __inline__ void GB_cuda_ek_slice_setup
 
     (*kfirst) = 0 ;
     int64_t kright = anvec ;
-    GB_trim_binary_search (pfirst, Ap, GB_Ap_IS_32, kfirst, &kright) ;
+    if (sizeof (T) == sizeof (uint32_t))
+    {
+        GB_trim_binary_search_32 (pfirst, (const uint32_t *) Ap, kfirst, &kright) ;
+    }
+    else
+    {
+        GB_trim_binary_search_64 (pfirst, (const uint64_t *) Ap, kfirst, &kright) ;
+    }
 
     // find klast, the last vector of the slice for this chunk.  klast is the
     // vector that owns the entry Ai [plast-1] and Ax [plast-1].  The search
     // does not have to be exact, so klast is an estimate.
 
     (*klast) = (*kfirst) ;
     kright = anvec ;
-    GB_trim_binary_search (plast, Ap, GB_Ap_IS_32, klast, &kright) ;
-
+    if (sizeof (T) == sizeof (uint32_t))
+    {
+        GB_trim_binary_search_32 (plast, (const uint32_t *) Ap, klast, &kright) ;
+    }
+    else
+    {
+        GB_trim_binary_search_64 (plast, (const uint64_t *) Ap, klast, &kright) ;
+    }
     //--------------------------------------------------------------------------
     // find slope of vectors in this chunk, and return result
     //--------------------------------------------------------------------------
@@ -148,15 +161,15 @@ static __device__ __inline__ void GB_cuda_ek_slice_setup
 // The method returns the index k of the vector in A that contains the pth
 // entry in A, at position p = pfirst + pdelta.
 
-static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
+template <typename T> __device__ int64_t GB_cuda_ek_slice_entry
 (
     // output:
     int64_t *p_handle,          // p = pfirst + pdelta
     // inputs, not modified:
     const int64_t pdelta,       // find the k value of the pfirst+pdelta entry
     const int64_t pfirst,       // first entry in A to find k (for which
                                 // pdelta=0)
-    const GB_Ap_TYPE *Ap,       // array of size anvec+1
+    const T *Ap,                // array of size anvec+1
     const int64_t anvec1,       // anvec-1
     const int64_t kfirst,       // estimate of first vector in the chunk
     const float slope           // estimate # vectors in chunk / my_chunk_size
@@ -199,10 +212,10 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
 // CPU.  The latter is for OpenMP parallelism on the CPU only; it does not
 // need to compute ks.
 
-static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
+template <typename T>__device__ int64_t GB_cuda_ek_slice // returns my_chunk_size
 (
     // inputs, not modified:
-    const GB_Ap_TYPE *Ap,       // array of size anvec+1
+    const T *Ap,                // array of size anvec+1
     const int64_t anvec,        // # of vectors in the matrix A
     const int64_t anz,          // # of entries in the sparse/hyper matrix A
     const int64_t pfirst,       // first entry in A to find k
@@ -218,7 +231,7 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
 
     int64_t my_chunk_size, anvec1, kfirst, klast ;
     float slope ;
-    GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, max_pchunk,
+    GB_cuda_ek_slice_setup<T> (Ap, anvec, anz, pfirst, max_pchunk,
         &kfirst, &klast, &my_chunk_size, &anvec1, &slope) ;
 
     //--------------------------------------------------------------------------
@@ -235,7 +248,7 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
         //----------------------------------------------------------------------
 
         int64_t p ;     // unused, p = pfirst + pdelta
-        int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap, anvec1,
+        int64_t k = GB_cuda_ek_slice_entry<T> (&p, pdelta, pfirst, Ap, anvec1,
             kfirst, slope) ;
 
         //----------------------------------------------------------------------
 
@@ -69,7 +69,7 @@ __global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
         // pfirst + my_chunk_size - 1.
         int64_t my_chunk_size, mnvec1, kfirst, klast ;
         float slope ;
-        GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, chunk_size,
+        GB_cuda_ek_slice_setup<GB_Mp_TYPE> (Mp, mnvec, mnz, pfirst, chunk_size,
             &kfirst, &klast, &my_chunk_size, &mnvec1, &slope) ;
 
         //----------------------------------------------------------------------
@@ -83,7 +83,7 @@ __global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
 
             // get the pM and k value of Mi,Mx [pM]:
             int64_t pM ;    // = pfirst + pdelta
-            int64_t k = GB_cuda_ek_slice_entry (&pM, pdelta, pfirst, Mp, mnvec1,
+            int64_t k = GB_cuda_ek_slice_entry<GB_Mp_TYPE> (&pM, pdelta, pfirst, Mp, mnvec1,
                 kfirst, slope) ;
 
             #if GB_MASK_STRUCT
 
@@ -138,7 +138,7 @@ __global__ void GB_jit_AxB_dot3_phase1_kernel
         // pfirst + my_chunk_size - 1.
         int64_t my_chunk_size, mnvec1, kfirst, klast ;
         float slope ;
-        GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, chunk_size,
+        GB_cuda_ek_slice_setup<GB_Mp_TYPE> (Mp, mnvec, mnz, pfirst, chunk_size,
             &kfirst, &klast, &my_chunk_size, &mnvec1, &slope) ;
 
         //----------------------------------------------------------------------
@@ -158,7 +158,7 @@ __global__ void GB_jit_AxB_dot3_phase1_kernel
 
             // get the pM and k value of Mi,Mx [pM]
             int64_t pM ;    // = pfirst + pdelta
-            int64_t k = GB_cuda_ek_slice_entry (&pM, pdelta, pfirst, Mp, mnvec1,
+            int64_t k = GB_cuda_ek_slice_entry<GB_Mp_TYPE> (&pM, pdelta, pfirst, Mp, mnvec1,
                 kfirst, slope) ;
 
             //------------------------------------------------------------------
 
@@ -236,6 +236,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
     dim3 grid_1 (number_of_blocks_1) ;
     dim3 block (threads_per_block) ;
 
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
+
     //--------------------------------------------------------------------------
     // C<M>=A'*B via jitified kernels
     //--------------------------------------------------------------------------
@@ -265,8 +268,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
         // kernel_timer.Start();
         GB_cuda_AxB_dot3_dense_phase1_kernel <<<grid_1, block, 0, stream>>>
             (C, M) ;
-
-        CUDA_OK (cudaStreamSynchronize(stream)) ;  // is this needed?
+        CUDA_OK (cudaGetLastError ( )) ;
+        CUDA_OK (cudaStreamSynchronize (stream)) ;
 
         // kernel_timer.Stop();
         // printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
@@ -364,7 +367,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
         // printf ("\nLaunching sparse phase1:\n") ;
         GB_jit_AxB_dot3_phase1_kernel <<<grid_1, block, 0, stream>>>
             (Nanobuckets, Blockbucket, C, M, A, B) ;
-
+        CUDA_OK (cudaGetLastError ( )) ;
         CUDA_OK (cudaStreamSynchronize (stream)) ;
 
         // kernel_timer.Stop();
@@ -385,7 +388,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
         // printf ("Launching sparse phase2:\n") ;
         GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
             (Blockbucket, offset, number_of_blocks_1) ;
-
+        CUDA_OK (cudaGetLastError ( )) ;
         CUDA_OK (cudaStreamSynchronize (stream)) ;
 
         int64_t s = offset [0] ;
@@ -424,8 +427,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
             // printf ("Launching sparse phase2end:\n") ;
             GB_cuda_AxB_dot3_phase2end_kernel <<<grid_1, block, 0, stream>>>
                 (Nanobuckets, Blockbucket, Bucketp, Bucket, offset, C, mnz) ;
-
+            CUDA_OK (cudaGetLastError ( )) ;
             CUDA_OK (cudaStreamSynchronize (stream)) ;
+
             // kernel_timer.Stop();
             // printf ("(GPU phase2end %12.6g ms)\n",kernel_timer.Elapsed());
         }
@@ -472,6 +476,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
                             GB_cuda_AxB_dot3_phase3_vsvs_kernel
                                 <<<grid_3, block, 0, stream>>>
                                 (start, end, Bucket, C, M, A, B, theta) ;
+                            CUDA_OK (cudaGetLastError ( )) ;
+                            CUDA_OK (cudaStreamSynchronize (stream)) ;
                         }
                         break ;
 
@@ -504,6 +510,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
                             GB_cuda_AxB_dot3_phase3_mp_kernel
                                 <<<grid_3, block, shared_bytes, stream>>>
                                 (start, end, Bucket, C, M, A, B, theta) ;
+                            CUDA_OK (cudaGetLastError ( )) ;
+                            CUDA_OK (cudaStreamSynchronize (stream)) ;
                         }
                         break ;
 
@@ -531,6 +539,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
                             GB_cuda_AxB_dot3_phase3_vssp_kernel
                                 <<<grid_3, block, 0, stream>>>
                                 (start, end, Bucket, C, M, A, B, theta) ;
+                            CUDA_OK (cudaGetLastError ( )) ;
+                            CUDA_OK (cudaStreamSynchronize (stream)) ;
                         }
                         break ;
 
@@ -561,6 +571,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
                             GB_cuda_AxB_dot3_phase3_vsdn_kernel
                                 <<<grid_3, block, 0, stream>>>
                                 (start, end, Bucket, C, M, A, B, theta) ;
+                            CUDA_OK (cudaGetLastError ( )) ;
+                            CUDA_OK (cudaStreamSynchronize (stream)) ;
                         }
                         break ;
 
@@ -588,6 +600,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
                             GB_cuda_AxB_dot3_phase3_spdn_kernel
                                 <<<grid_3, block, 0, stream>>>
                                 (start, end, Bucket, C, M, A, B, theta) ;
+                            CUDA_OK (cudaGetLastError ( )) ;
+                            CUDA_OK (cudaStreamSynchronize (stream)) ;
                             break ;
                         }
                     }
 
@@ -1,3 +1,5 @@
+#define GB_FREE_ALL ;
+
 using namespace cooperative_groups ;
 
 __global__ void GB_cuda_apply_bind1st_kernel
@@ -40,8 +42,14 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO (GB_jit_kernel)
 
     dim3 grid (gridsz) ;
     dim3 block (blocksz) ;
+    GB_B_NHELD (nvals) ;
+    if (nvals == 0) return (GrB_SUCCESS) ;
 
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
     GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx, scalarx, B) ;
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
 
     return (GrB_SUCCESS) ;
 }
@@ -1,3 +1,5 @@
+#define GB_FREE_ALL ;
+
 using namespace cooperative_groups ;
 
 __global__ void GB_cuda_apply_bind2nd_kernel
@@ -40,8 +42,14 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO (GB_jit_kernel)
 
     dim3 grid (gridsz) ;
     dim3 block (blocksz) ;
-    
+    GB_A_NHELD (nvals) ;
+    if (nvals == 0) return (GrB_SUCCESS) ;
+
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
     GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A, scalarx) ;
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
 
     return (GrB_SUCCESS) ;
 }
@@ -1,3 +1,5 @@
+#define GB_FREE_ALL ;
+
 using namespace cooperative_groups ;
 
 #include "GB_cuda_ek_slice.cuh"
@@ -39,16 +41,15 @@ __global__ void GB_cuda_apply_unop_kernel
 
     #define A_iso GB_A_ISO
 
-    int tid = blockDim.x * blockIdx.x + threadIdx.x ;
-    int nthreads = blockDim.x * gridDim.x ;
-
     #if ( GB_DEPENDS_ON_Y )
         // get thunk value (of type GB_Y_TYPE)
         GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ;
     #endif
 
     #if ( GB_A_IS_BITMAP || GB_A_IS_FULL )
         // bitmap/full case
+        int tid = blockDim.x * blockIdx.x + threadIdx.x ;
+        int nthreads = blockDim.x * gridDim.x ;
         for (int64_t p = tid ; p < anz ; p += nthreads)
         {
             if (!GBb_A (Ab, p)) { continue ; }
@@ -74,13 +75,13 @@ __global__ void GB_cuda_apply_unop_kernel
                 {
                     int64_t my_chunk_size, anvec_sub1, kfirst, klast ;
                     float slope ;
-                    GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, chunk_size,
+                    GB_cuda_ek_slice_setup<GB_Ap_TYPE> (Ap, anvec, anz, pfirst, chunk_size,
                         &kfirst, &klast, &my_chunk_size, &anvec_sub1, &slope) ;
 
                     for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x)
                     {
                         int64_t p_final ;
-                        int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
+                        int64_t k = GB_cuda_ek_slice_entry<GB_Ap_TYPE> (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
                         int64_t col_idx = GBh_A (Ah, k) ;
 
                         #if ( GB_DEPENDS_ON_I )
@@ -92,8 +93,10 @@ __global__ void GB_cuda_apply_unop_kernel
                     }
                 }
         #else
-            const int64_t avlen = A->vlen ;
             // can do normal method
+            const int64_t avlen = A->vlen ;
+            int tid = blockDim.x * blockIdx.x + threadIdx.x ;
+            int nthreads = blockDim.x * gridDim.x ;
             for (int64_t p = tid ; p < anz ; p += nthreads)
             {
                 #if ( GB_DEPENDS_ON_I )
@@ -116,7 +119,14 @@ GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel)
     dim3 grid (gridsz) ;
     dim3 block (blocksz) ;
 
+    GB_A_NHELD (anz) ;
+    if (anz == 0) return (GrB_SUCCESS) ;
+
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
     GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, ythunk, A) ;
+    CUDA_OK (cudaGetLastError ( )) ;
+    CUDA_OK (cudaStreamSynchronize (stream)) ;
 
     return (GrB_SUCCESS) ;
 }