Skip to content

Commit 1f199f6

Browse files
Merge pull request #410 from DrTimothyAldenDavis/dev2
10.0.2
2 parents e0fb970 + 8f09448 commit 1f199f6

26 files changed

+196
-57
lines changed

CUDA/GB_cuda_apply_unop.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ GrB_Info GB_cuda_apply_unop
3333
GB_void *ythunk_cuda = NULL ;
3434
size_t ythunk_cuda_size = 0 ;
3535

36+
GrB_Index anz = GB_nnz_held (A) ;
37+
if (anz == 0) return (GrB_SUCCESS) ;
38+
3639
// FIXME: use the stream pool
3740
cudaStream_t stream = nullptr ;
3841
CUDA_OK (cudaStreamCreate (&stream)) ;
@@ -52,8 +55,6 @@ GrB_Info GB_cuda_apply_unop
5255
memcpy (ythunk_cuda, ythunk, op->ytype->size) ;
5356
}
5457

55-
GrB_Index anz = GB_nnz_held (A) ;
56-
5758
int32_t number_of_sms = GB_Global_gpu_sm_get (0) ;
5859
int64_t raw_gridsz = GB_ICEIL (anz, BLOCK_SIZE) ;
5960
// cap #of blocks to 256 * #of sms

CUDA/GB_cuda_select_sparse.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ GrB_Info GB_cuda_select_sparse
6868
C->jumbled = A->jumbled ;
6969
C->iso = C_iso ;
7070

71+
CUDA_OK (cudaGetLastError ( )) ; //FIXME: remove
72+
CUDA_OK (cudaStreamSynchronize (stream)) ; //FIXME: remove
73+
CUDA_OK (cudaGetLastError ( )) ; //FIXME: remove
74+
CUDA_OK (cudaStreamSynchronize (stream)) ; //FIXME: remove
75+
CUDA_OK (cudaGetLastError ( )) ; //FIXME: remove
76+
7177
GB_OK (GB_cuda_select_sparse_jit (C, A,
7278
flipij, ythunk, op, stream, gridsz, BLOCK_SIZE)) ;
7379

CUDA/template/GB_cuda_ek_slice.cuh

Lines changed: 24 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,10 +66,10 @@
6666
// GB_cuda_ek_slice_setup
6767
//------------------------------------------------------------------------------
6868

69-
static __device__ __inline__ void GB_cuda_ek_slice_setup
69+
template <typename T> __device__ void GB_cuda_ek_slice_setup
7070
(
7171
// inputs, not modified:
72-
const GB_Ap_TYPE *Ap, // array of size anvec+1
72+
const T *Ap, // array of size anvec+1
7373
const int64_t anvec, // # of vectors in the matrix A
7474
const int64_t anz, // # of entries in the sparse/hyper matrix A
7575
const int64_t pfirst, // first entry in A to find k
@@ -107,16 +107,29 @@ static __device__ __inline__ void GB_cuda_ek_slice_setup
107107

108108
(*kfirst) = 0 ;
109109
int64_t kright = anvec ;
110-
GB_trim_binary_search (pfirst, Ap, GB_Ap_IS_32, kfirst, &kright) ;
110+
if (sizeof (T) == sizeof (uint32_t))
111+
{
112+
GB_trim_binary_search_32 (pfirst, (const uint32_t *) Ap, kfirst, &kright) ;
113+
}
114+
else
115+
{
116+
GB_trim_binary_search_64 (pfirst, (const uint64_t *) Ap, kfirst, &kright) ;
117+
}
111118

112119
// find klast, the last vector of the slice for this chunk. klast is the
113120
// vector that owns the entry Ai [plast-1] and Ax [plast-1]. The search
114121
// does not have to be exact, so klast is an estimate.
115122

116123
(*klast) = (*kfirst) ;
117124
kright = anvec ;
118-
GB_trim_binary_search (plast, Ap, GB_Ap_IS_32, klast, &kright) ;
119-
125+
if (sizeof (T) == sizeof (uint32_t))
126+
{
127+
GB_trim_binary_search_32 (plast, (const uint32_t *) Ap, klast, &kright) ;
128+
}
129+
else
130+
{
131+
GB_trim_binary_search_64 (plast, (const uint64_t *) Ap, klast, &kright) ;
132+
}
120133
//--------------------------------------------------------------------------
121134
// find slope of vectors in this chunk, and return result
122135
//--------------------------------------------------------------------------
@@ -148,15 +161,15 @@ static __device__ __inline__ void GB_cuda_ek_slice_setup
148161
// The method returns the index k of the vector in A that contains the pth
149162
// entry in A, at position p = pfirst + pdelta.
150163

151-
static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
164+
template <typename T> __device__ int64_t GB_cuda_ek_slice_entry
152165
(
153166
// output:
154167
int64_t *p_handle, // p = pfirst + pdelta
155168
// inputs, not modified:
156169
const int64_t pdelta, // find the k value of the pfirst+pdelta entry
157170
const int64_t pfirst, // first entry in A to find k (for which
158171
// pdelta=0)
159-
const GB_Ap_TYPE *Ap, // array of size anvec+1
172+
const T *Ap, // array of size anvec+1
160173
const int64_t anvec1, // anvec-1
161174
const int64_t kfirst, // estimate of first vector in the chunk
162175
const float slope // estimate # vectors in chunk / my_chunk_size
@@ -199,10 +212,10 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice_entry
199212
// CPU. The latter is for OpenMP parallelism on the CPU only; it does not
200213
// need to compute ks.
201214

202-
static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
215+
template <typename T>__device__ int64_t GB_cuda_ek_slice // returns my_chunk_size
203216
(
204217
// inputs, not modified:
205-
const GB_Ap_TYPE *Ap, // array of size anvec+1
218+
const T *Ap, // array of size anvec+1
206219
const int64_t anvec, // # of vectors in the matrix A
207220
const int64_t anz, // # of entries in the sparse/hyper matrix A
208221
const int64_t pfirst, // first entry in A to find k
@@ -218,7 +231,7 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
218231

219232
int64_t my_chunk_size, anvec1, kfirst, klast ;
220233
float slope ;
221-
GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, max_pchunk,
234+
GB_cuda_ek_slice_setup<T> (Ap, anvec, anz, pfirst, max_pchunk,
222235
&kfirst, &klast, &my_chunk_size, &anvec1, &slope) ;
223236

224237
//--------------------------------------------------------------------------
@@ -235,7 +248,7 @@ static __device__ __inline__ int64_t GB_cuda_ek_slice // returns my_chunk_size
235248
//----------------------------------------------------------------------
236249

237250
int64_t p ; // unused, p = pfirst + pdelta
238-
int64_t k = GB_cuda_ek_slice_entry (&p, pdelta, pfirst, Ap, anvec1,
251+
int64_t k = GB_cuda_ek_slice_entry<T> (&p, pdelta, pfirst, Ap, anvec1,
239252
kfirst, slope) ;
240253

241254
//----------------------------------------------------------------------

CUDA/template/GB_cuda_jit_AxB_dot3_dense_phase1.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ __global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
6969
// pfirst + my_chunk_size - 1.
7070
int64_t my_chunk_size, mnvec1, kfirst, klast ;
7171
float slope ;
72-
GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, chunk_size,
72+
GB_cuda_ek_slice_setup<GB_Mp_TYPE> (Mp, mnvec, mnz, pfirst, chunk_size,
7373
&kfirst, &klast, &my_chunk_size, &mnvec1, &slope) ;
7474

7575
//----------------------------------------------------------------------
@@ -83,7 +83,7 @@ __global__ void GB_cuda_AxB_dot3_dense_phase1_kernel
8383

8484
// get the pM and k value of Mi,Mx [pM]:
8585
int64_t pM ; // = pfirst + pdelta
86-
int64_t k = GB_cuda_ek_slice_entry (&pM, pdelta, pfirst, Mp, mnvec1,
86+
int64_t k = GB_cuda_ek_slice_entry<GB_Mp_TYPE> (&pM, pdelta, pfirst, Mp, mnvec1,
8787
kfirst, slope) ;
8888

8989
#if GB_MASK_STRUCT

CUDA/template/GB_cuda_jit_AxB_dot3_phase1.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ __global__ void GB_jit_AxB_dot3_phase1_kernel
138138
// pfirst + my_chunk_size - 1.
139139
int64_t my_chunk_size, mnvec1, kfirst, klast ;
140140
float slope ;
141-
GB_cuda_ek_slice_setup (Mp, mnvec, mnz, pfirst, chunk_size,
141+
GB_cuda_ek_slice_setup<GB_Mp_TYPE> (Mp, mnvec, mnz, pfirst, chunk_size,
142142
&kfirst, &klast, &my_chunk_size, &mnvec1, &slope) ;
143143

144144
//----------------------------------------------------------------------
@@ -158,7 +158,7 @@ __global__ void GB_jit_AxB_dot3_phase1_kernel
158158

159159
// get the pM and k value of Mi,Mx [pM]
160160
int64_t pM ; // = pfirst + pdelta
161-
int64_t k = GB_cuda_ek_slice_entry (&pM, pdelta, pfirst, Mp, mnvec1,
161+
int64_t k = GB_cuda_ek_slice_entry<GB_Mp_TYPE> (&pM, pdelta, pfirst, Mp, mnvec1,
162162
kfirst, slope) ;
163163

164164
//------------------------------------------------------------------

CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
236236
dim3 grid_1 (number_of_blocks_1) ;
237237
dim3 block (threads_per_block) ;
238238

239+
CUDA_OK (cudaGetLastError ( )) ;
240+
CUDA_OK (cudaStreamSynchronize (stream)) ;
241+
239242
//--------------------------------------------------------------------------
240243
// C<M>=A'*B via jitified kernels
241244
//--------------------------------------------------------------------------
@@ -265,8 +268,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
265268
// kernel_timer.Start();
266269
GB_cuda_AxB_dot3_dense_phase1_kernel <<<grid_1, block, 0, stream>>>
267270
(C, M) ;
268-
269-
CUDA_OK (cudaStreamSynchronize(stream)) ; // is this needed?
271+
CUDA_OK (cudaGetLastError ( )) ;
272+
CUDA_OK (cudaStreamSynchronize (stream)) ;
270273

271274
// kernel_timer.Stop();
272275
// printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
@@ -364,7 +367,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
364367
// printf ("\nLaunching sparse phase1:\n") ;
365368
GB_jit_AxB_dot3_phase1_kernel <<<grid_1, block, 0, stream>>>
366369
(Nanobuckets, Blockbucket, C, M, A, B) ;
367-
370+
CUDA_OK (cudaGetLastError ( )) ;
368371
CUDA_OK (cudaStreamSynchronize (stream)) ;
369372

370373
// kernel_timer.Stop();
@@ -385,7 +388,7 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
385388
// printf ("Launching sparse phase2:\n") ;
386389
GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
387390
(Blockbucket, offset, number_of_blocks_1) ;
388-
391+
CUDA_OK (cudaGetLastError ( )) ;
389392
CUDA_OK (cudaStreamSynchronize (stream)) ;
390393

391394
int64_t s = offset [0] ;
@@ -424,8 +427,9 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
424427
// printf ("Launching sparse phase2end:\n") ;
425428
GB_cuda_AxB_dot3_phase2end_kernel <<<grid_1, block, 0, stream>>>
426429
(Nanobuckets, Blockbucket, Bucketp, Bucket, offset, C, mnz) ;
427-
430+
CUDA_OK (cudaGetLastError ( )) ;
428431
CUDA_OK (cudaStreamSynchronize (stream)) ;
432+
429433
// kernel_timer.Stop();
430434
// printf ("(GPU phase2end %12.6g ms)\n",kernel_timer.Elapsed());
431435
}
@@ -472,6 +476,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
472476
GB_cuda_AxB_dot3_phase3_vsvs_kernel
473477
<<<grid_3, block, 0, stream>>>
474478
(start, end, Bucket, C, M, A, B, theta) ;
479+
CUDA_OK (cudaGetLastError ( )) ;
480+
CUDA_OK (cudaStreamSynchronize (stream)) ;
475481
}
476482
break ;
477483

@@ -504,6 +510,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
504510
GB_cuda_AxB_dot3_phase3_mp_kernel
505511
<<<grid_3, block, shared_bytes, stream>>>
506512
(start, end, Bucket, C, M, A, B, theta) ;
513+
CUDA_OK (cudaGetLastError ( )) ;
514+
CUDA_OK (cudaStreamSynchronize (stream)) ;
507515
}
508516
break ;
509517

@@ -531,6 +539,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
531539
GB_cuda_AxB_dot3_phase3_vssp_kernel
532540
<<<grid_3, block, 0, stream>>>
533541
(start, end, Bucket, C, M, A, B, theta) ;
542+
CUDA_OK (cudaGetLastError ( )) ;
543+
CUDA_OK (cudaStreamSynchronize (stream)) ;
534544
}
535545
break ;
536546

@@ -561,6 +571,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
561571
GB_cuda_AxB_dot3_phase3_vsdn_kernel
562572
<<<grid_3, block, 0, stream>>>
563573
(start, end, Bucket, C, M, A, B, theta) ;
574+
CUDA_OK (cudaGetLastError ( )) ;
575+
CUDA_OK (cudaStreamSynchronize (stream)) ;
564576
}
565577
break ;
566578

@@ -588,6 +600,8 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
588600
GB_cuda_AxB_dot3_phase3_spdn_kernel
589601
<<<grid_3, block, 0, stream>>>
590602
(start, end, Bucket, C, M, A, B, theta) ;
603+
CUDA_OK (cudaGetLastError ( )) ;
604+
CUDA_OK (cudaStreamSynchronize (stream)) ;
591605
break ;
592606
}
593607
}

CUDA/template/GB_jit_kernel_cuda_apply_bind1st.cu

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#define GB_FREE_ALL ;
2+
13
using namespace cooperative_groups ;
24

35
__global__ void GB_cuda_apply_bind1st_kernel
@@ -40,8 +42,14 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND1ST_PROTO (GB_jit_kernel)
4042

4143
dim3 grid (gridsz) ;
4244
dim3 block (blocksz) ;
45+
GB_B_NHELD (nvals) ;
46+
if (nvals == 0) return (GrB_SUCCESS) ;
4347

48+
CUDA_OK (cudaGetLastError ( )) ;
49+
CUDA_OK (cudaStreamSynchronize (stream)) ;
4450
GB_cuda_apply_bind1st_kernel <<<grid, block, 0, stream>>> (Cx, scalarx, B) ;
51+
CUDA_OK (cudaGetLastError ( )) ;
52+
CUDA_OK (cudaStreamSynchronize (stream)) ;
4553

4654
return (GrB_SUCCESS) ;
4755
}

CUDA/template/GB_jit_kernel_cuda_apply_bind2nd.cu

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#define GB_FREE_ALL ;
2+
13
using namespace cooperative_groups ;
24

35
__global__ void GB_cuda_apply_bind2nd_kernel
@@ -40,8 +42,14 @@ GB_JIT_CUDA_KERNEL_APPLY_BIND2ND_PROTO (GB_jit_kernel)
4042

4143
dim3 grid (gridsz) ;
4244
dim3 block (blocksz) ;
43-
45+
GB_A_NHELD (nvals) ;
46+
if (nvals == 0) return (GrB_SUCCESS) ;
47+
48+
CUDA_OK (cudaGetLastError ( )) ;
49+
CUDA_OK (cudaStreamSynchronize (stream)) ;
4450
GB_cuda_apply_bind2nd_kernel <<<grid, block, 0, stream>>> (Cx, A, scalarx) ;
51+
CUDA_OK (cudaGetLastError ( )) ;
52+
CUDA_OK (cudaStreamSynchronize (stream)) ;
4553

4654
return (GrB_SUCCESS) ;
4755
}

CUDA/template/GB_jit_kernel_cuda_apply_unop.cu

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#define GB_FREE_ALL ;
2+
13
using namespace cooperative_groups ;
24

35
#include "GB_cuda_ek_slice.cuh"
@@ -39,16 +41,15 @@ __global__ void GB_cuda_apply_unop_kernel
3941

4042
#define A_iso GB_A_ISO
4143

42-
int tid = blockDim.x * blockIdx.x + threadIdx.x ;
43-
int nthreads = blockDim.x * gridDim.x ;
44-
4544
#if ( GB_DEPENDS_ON_Y )
4645
// get thunk value (of type GB_Y_TYPE)
4746
GB_Y_TYPE thunk_value = * ((GB_Y_TYPE *) thunk) ;
4847
#endif
4948

5049
#if ( GB_A_IS_BITMAP || GB_A_IS_FULL )
5150
// bitmap/full case
51+
int tid = blockDim.x * blockIdx.x + threadIdx.x ;
52+
int nthreads = blockDim.x * gridDim.x ;
5253
for (int64_t p = tid ; p < anz ; p += nthreads)
5354
{
5455
if (!GBb_A (Ab, p)) { continue ; }
@@ -74,13 +75,13 @@ __global__ void GB_cuda_apply_unop_kernel
7475
{
7576
int64_t my_chunk_size, anvec_sub1, kfirst, klast ;
7677
float slope ;
77-
GB_cuda_ek_slice_setup (Ap, anvec, anz, pfirst, chunk_size,
78+
GB_cuda_ek_slice_setup<GB_Ap_TYPE> (Ap, anvec, anz, pfirst, chunk_size,
7879
&kfirst, &klast, &my_chunk_size, &anvec_sub1, &slope) ;
7980

8081
for (int64_t pdelta = threadIdx.x ; pdelta < my_chunk_size ; pdelta += blockDim.x)
8182
{
8283
int64_t p_final ;
83-
int64_t k = GB_cuda_ek_slice_entry (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
84+
int64_t k = GB_cuda_ek_slice_entry<GB_Ap_TYPE> (&p_final, pdelta, pfirst, Ap, anvec_sub1, kfirst, slope) ;
8485
int64_t col_idx = GBh_A (Ah, k) ;
8586

8687
#if ( GB_DEPENDS_ON_I )
@@ -92,8 +93,10 @@ __global__ void GB_cuda_apply_unop_kernel
9293
}
9394
}
9495
#else
95-
const int64_t avlen = A->vlen ;
9696
// can do normal method
97+
const int64_t avlen = A->vlen ;
98+
int tid = blockDim.x * blockIdx.x + threadIdx.x ;
99+
int nthreads = blockDim.x * gridDim.x ;
97100
for (int64_t p = tid ; p < anz ; p += nthreads)
98101
{
99102
#if ( GB_DEPENDS_ON_I )
@@ -116,7 +119,14 @@ GB_JIT_CUDA_KERNEL_APPLY_UNOP_PROTO (GB_jit_kernel)
116119
dim3 grid (gridsz) ;
117120
dim3 block (blocksz) ;
118121

122+
GB_A_NHELD (anz) ;
123+
if (anz == 0) return (GrB_SUCCESS) ;
124+
125+
CUDA_OK (cudaGetLastError ( )) ;
126+
CUDA_OK (cudaStreamSynchronize (stream)) ;
119127
GB_cuda_apply_unop_kernel <<<grid, block, 0, stream>>> (Cx, ythunk, A) ;
128+
CUDA_OK (cudaGetLastError ( )) ;
129+
CUDA_OK (cudaStreamSynchronize (stream)) ;
120130

121131
return (GrB_SUCCESS) ;
122132
}

0 commit comments

Comments
 (0)