Skip to content

Commit 1bf1dde

Browse files
Merge pull request #144 from DrTimothyAldenDavis/master
Master
2 parents 8881e16 + 0528f93 commit 1bf1dde

File tree

116 files changed

+9891
-40793
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

116 files changed

+9891
-40793
lines changed

CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,10 @@ endif ( )
2626
set ( CMAKE_MACOSX_RPATH TRUE )
2727

2828
# version of SuiteSparse:GraphBLAS
29-
set ( GraphBLAS_DATE "Apr 25, 2022" )
29+
set ( GraphBLAS_DATE "May 20, 2022" )
3030
set ( GraphBLAS_VERSION_MAJOR 7 )
31-
set ( GraphBLAS_VERSION_MINOR 0 )
32-
set ( GraphBLAS_VERSION_SUB 4 )
31+
set ( GraphBLAS_VERSION_MINOR 1 )
32+
set ( GraphBLAS_VERSION_SUB 0 )
3333

3434
message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB} " date: " ${GraphBLAS_DATE} )
3535

CUDA/.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
*.so
55
jitFactory
66
stringify
7+
rmm_log.txt
78

89
# Do not ignore this file
910
!.gitignore

CUDA/GB_AxB_dot3_cuda.cpp

Lines changed: 23 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -139,12 +139,17 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
139139
int64_t cnz = mnz ;
140140
int64_t cnvec = mnvec ;
141141

142-
int sparsity_M = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
142+
int M_sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ;
143+
int C_sparsity = M_sparsity ;
144+
bool C_iso = false ;
143145
info = GB_new_bix (&C, // sparse or hyper (from M), existing header
144146
ctype, cvlen, cvdim, GB_Ap_malloc, true,
145-
sparsity_M, false, M->hyper_switch, cnvec,
147+
M_sparsity, false, M->hyper_switch, cnvec,
146148
cnz+1, // add one to cnz for GB_cumsum of Cwork
147-
true, /* not iso: */ false, Context) ;
149+
true, C_iso, Context) ;
150+
151+
CHECK_CUDA_SIMPLE(cudaMemset(C->i, 0, (cnz+1) * sizeof(int64_t)));
152+
CHECK_CUDA_SIMPLE(cudaMemset(C->x, 0, (cnz+1) * sizeof(ctype->size)));
148153

149154
if (info != GrB_SUCCESS)
150155
{
@@ -174,7 +179,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
174179

175180
C->magic = GB_MAGIC ;
176181
C->nvec_nonempty = M->nvec_nonempty ;
177-
C->nvec = M->nvec ;
182+
// C->nvec = M->nvec ;
178183
// the dot3 CUDA kernel will produce C->i with jumbled indices
179184
C->jumbled = true ;
180185

@@ -183,16 +188,15 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
183188
// stringify the semiring and the mask
184189
//--------------------------------------------------------------------------
185190

186-
GB_cuda_semiring_factory mysemiring = GB_cuda_semiring_factory ( ) ;
191+
GB_cuda_mxm_factory my_mxm_spec = GB_cuda_mxm_factory ( ) ;
187192

188-
// (1) create the semiring code and name
189-
mysemiring.semiring_factory ( semiring, flipxy,
190-
ctype, M->type, A->type, B->type, Mask_struct, // matrix types
191-
false, GB_sparsity(C), GB_sparsity(M), GB_sparsity(A), GB_sparsity(B) ) ;
193+
// (1) create the mxm code and name
194+
my_mxm_spec.mxm_factory ( C_iso, C_sparsity, ctype,
195+
M, Mask_struct, false, semiring, flipxy, A, B) ;
192196

193-
// (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
197+
// (2) ensure the jitifier has "GB_mxm_[my_mxm_spec.sr_code].h"
194198
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
195-
filecache.getFile (mysemiring) ;
199+
filecache.getFile (my_mxm_spec) ;
196200

197201
GBURBLE ("(GPU stringified) ") ;
198202
//--------------------------------------------------------------------------
@@ -201,7 +205,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
201205

202206
// on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
203207
// on the GPU:
204-
phase1launchFactory p1lf(mysemiring);
208+
phase1launchFactory p1lf(my_mxm_spec);
205209
phase2launchFactory p2lf;
206210
phase2endlaunchFactory p2elf;
207211

@@ -233,26 +237,28 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
233237
CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
234238
CHECK_CUDA_SIMPLE(cudaMemAdvise( Bucketp, (NBUCKETS+1) * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));
235239

236-
offset = (int64_t*)rmm_wrap_malloc( (NBUCKETS)*sizeof(int64_t)) ;
237240
CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
238241
CHECK_CUDA_SIMPLE(cudaMemAdvise( offset, NBUCKETS * sizeof ( int64_t), cudaMemAdviseSetAccessedBy, device));
239242

240-
memset( offset, 0, NBUCKETS * sizeof(int64_t) );
241-
242243
//--------------------------------------------------------------------------
243244
// Pre-fetch arrays that will be used on the device
244245
//--------------------------------------------------------------------------
245246

246247
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL)) ; //stream_data) ;
247248
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
249+
// FIXME: if Mask_struct is true, skip this:
248250
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL )) ; //stream_data) ;
251+
249252
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), device, NULL )); //stream_data) ;
253+
// FIXME: skip if C iso:
250254
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, device, NULL )); //stream_data) ;
251255
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL)); // stream_data) ;
252256
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
257+
// FIXME: skip if A iso:
253258
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL )) ; //stream_data) ;
254259
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL)); //stream_data) ;
255260
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL )); //stream_data) ;
261+
// FIXME: skip if B iso:
256262
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL )); //stream_data) ;
257263

258264
// The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in
@@ -281,6 +287,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
281287
p2lf.jitGridBlockLaunch(Blockbucket, offset, M );
282288

283289
int64_t s= offset[0];
290+
C->nzombies = s;
284291
for ( int bucket = 1 ; bucket < NBUCKETS+1; ++bucket)
285292
{
286293
Bucketp[bucket] = s;
@@ -316,7 +323,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
316323
if(end - start > 0) {
317324
printf("Executing bucket: %d with %ld edges\n", bucket, end-start);
318325
// TODO: We might want to consider submitting these in different cuda streams (maybe use cuda stream pool?)
319-
phase3launchFactory p3lf(mysemiring, (GB_bucket_code)bucket);
326+
phase3launchFactory p3lf(my_mxm_spec, (GB_bucket_code)bucket);
320327
p3lf.jitGridBlockLaunch(start, end, Bucketp, Bucket, C, M, A, B);
321328
} else {
322329
printf("Skipping bucket %d, no work to do\n", bucket);

CUDA/GB_AxB_dot3_cuda_branch.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ bool GB_AxB_dot3_cuda_branch
4141
GBURBLE (" work:%g GPUs:%d ", work, ngpus_to_use) ;
4242
if (ngpus_to_use > 0
4343
// FIXME: FUTURE: user-defined types and operators
44-
// && (semiring->header_size == 0) // semiring is built-in
4544
&& (A->type->code != GB_UDT_code)
4645
&& (B->type->code != GB_UDT_code)
4746
// FIXME: M could be hypersparse. we should handle this

CUDA/GB_Matrix_allocate.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11

22
#ifndef GB_MATRIX_ALLOCATE_H
33
#define GB_MATRIX_ALLOCATE_H
4-
#include "matrix.h"
5-
#include "pmr_malloc.h"
4+
#include "GB_cuda_kernel.h"
5+
#include "rmm_wrap.h"
66

77
#ifdef __cplusplus
88
extern "C" {

CUDA/GB_cuda.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ extern "C"
2121
#include "cuda_runtime.h"
2222
#include "cuda.h"
2323
#include "jitify.hpp"
24-
#include "GB_cuda_semiring_factory.hpp"
24+
#include "GB_cuda_mxm_factory.hpp"
2525

2626
#include <cassert>
2727
#include <cmath>

CUDA/GB_cuda_kernel.h

Lines changed: 237 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,237 @@
1+
//------------------------------------------------------------------------------
2+
// CUDA/GB_cuda_kernel.h: definitions for all GraphBLAS CUDA kernels
3+
//------------------------------------------------------------------------------
4+
5+
// SPDX-License-Identifier: Apache-2.0
6+
7+
//------------------------------------------------------------------------------
8+
9+
// This file is #include'd into all CUDA kernels for GraphBLAS. It provides
10+
// a
11+
12+
#pragma once
13+
#undef ASSERT
14+
#define ASSERT(x)
15+
16+
//------------------------------------------------------------------------------
17+
// TODO: this will be in the jit code:
18+
#define chunksize 128
19+
20+
//------------------------------------------------------------------------------
21+
// GETA, GETB: get entries from input matrices A and B
22+
//------------------------------------------------------------------------------
23+
24+
#if GB_FLIPXY
25+
26+
#if GB_A_IS_PATTERN
27+
#define GB_DECLAREA(aval)
28+
#define GB_SHAREDA(aval)
29+
#define GB_GETA( aval, ax, p)
30+
#else
31+
#define GB_DECLAREA(aval) T_Y aval
32+
#define GB_SHAREDA(aval) __shared__ T_Y aval
33+
#if GB_A_ISO
34+
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [0]) ;
35+
#else
36+
#define GB_GETA( aval, ax, p) aval = (T_Y) (ax [p]) ;
37+
#endif
38+
#endif
39+
40+
#if GB_B_IS_PATTERN
41+
#define GB_DECLAREB(bval)
42+
#define GB_SHAREDB(bval)
43+
#define GB_GETB( bval, bx, p)
44+
#else
45+
#define GB_DECLAREB(bval) T_X bval
46+
#define GB_SHAREDB(bval) __shared__ T_X bval
47+
#if GB_B_ISO
48+
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [0]) ;
49+
#else
50+
#define GB_GETB( bval, bx, p) bval = (T_X) (bx [p]) ;
51+
#endif
52+
#endif
53+
54+
#else
55+
56+
#if GB_A_IS_PATTERN
57+
#define GB_DECLAREA(aval)
58+
#define GB_SHAREDA(aval)
59+
#define GB_GETA( aval, ax, p)
60+
#else
61+
#define GB_DECLAREA(aval) T_X aval
62+
#define GB_SHAREDA(aval) __shared__ T_X aval
63+
#if GB_A_ISO
64+
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [0]) ;
65+
#else
66+
#define GB_GETA( aval, ax, p) aval = (T_X) (ax [p]) ;
67+
#endif
68+
#endif
69+
70+
#if GB_B_IS_PATTERN
71+
#define GB_DECLAREB(bval)
72+
#define GB_SHAREDB(bval)
73+
#define GB_GETB( bval, bx, p)
74+
#else
75+
#define GB_DECLAREB(bval) T_Y bval
76+
#define GB_SHAREDB(bval) __shared__ T_Y bval
77+
#if GB_B_ISO
78+
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [0]) ;
79+
#else
80+
#define GB_GETB( bval, bx, p) bval = (T_Y) (bx [p]) ;
81+
#endif
82+
#endif
83+
84+
#endif
85+
86+
//------------------------------------------------------------------------------
87+
// operators
88+
//------------------------------------------------------------------------------
89+
90+
#if GB_C_ISO
91+
92+
#define GB_ADD_F( f , s)
93+
#define GB_C_MULT( c, a, b)
94+
#define GB_MULTADD( c, a ,b )
95+
#define GB_DOT_TERMINAL ( c )
96+
#define GB_DOT_MERGE \
97+
{ \
98+
cij_exists = true ; \
99+
}
100+
101+
#else
102+
103+
#define GB_ADD_F( f , s) f = GB_ADD ( f, s )
104+
#define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) )
105+
#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) )
106+
#define GB_DOT_TERMINAL ( c )
107+
//# if ( c == TERMINAL_VALUE) break;
108+
// cij += A(k,i) * B(k,j), for merge operation
109+
110+
#define GB_DOT_MERGE \
111+
{ \
112+
GB_GETA ( aki, Ax, pA) ; /* aki = A(k,i) */ \
113+
GB_GETB ( bkj, Bx, pB) ; /* bkj = B(k,j) */ \
114+
if (cij_exists) \
115+
{ \
116+
GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \
117+
} \
118+
else \
119+
{ \
120+
/* cij = A(k,i) * B(k,j), and add to the pattern */ \
121+
cij_exists = true ; \
122+
GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \
123+
} \
124+
}
125+
126+
#endif
127+
128+
//------------------------------------------------------------------------------
129+
// subset of GraphBLAS.h
130+
//------------------------------------------------------------------------------
131+
132+
#ifndef GRAPHBLAS_H
133+
#define GRAPHBLAS_H
134+
135+
#undef restrict
136+
#undef GB_restrict
137+
#if defined ( GB_CUDA_KERNEL ) || defined ( __NVCC__ )
138+
#define GB_restrict __restrict__
139+
#else
140+
#define GB_restrict
141+
#endif
142+
#define restrict GB_restrict
143+
144+
#include <stdint.h>
145+
#include <stdbool.h>
146+
#include <stddef.h>
147+
#include <string.h>
148+
149+
// GB_STR: convert the content of x into a string "x"
150+
#define GB_XSTR(x) GB_STR(x)
151+
#define GB_STR(x) #x
152+
153+
#undef GB_PUBLIC
154+
#define GB_PUBLIC extern
155+
#undef GxB_MAX_NAME_LEN
156+
#define GxB_MAX_NAME_LEN 128
157+
158+
typedef uint64_t GrB_Index ;
159+
typedef struct GB_Descriptor_opaque *GrB_Descriptor ;
160+
typedef struct GB_Type_opaque *GrB_Type ;
161+
typedef struct GB_UnaryOp_opaque *GrB_UnaryOp ;
162+
typedef struct GB_BinaryOp_opaque *GrB_BinaryOp ;
163+
typedef struct GB_SelectOp_opaque *GxB_SelectOp ;
164+
typedef struct GB_IndexUnaryOp_opaque *GrB_IndexUnaryOp ;
165+
typedef struct GB_Monoid_opaque *GrB_Monoid ;
166+
typedef struct GB_Semiring_opaque *GrB_Semiring ;
167+
typedef struct GB_Scalar_opaque *GrB_Scalar ;
168+
typedef struct GB_Vector_opaque *GrB_Vector ;
169+
typedef struct GB_Matrix_opaque *GrB_Matrix ;
170+
171+
#define GxB_HYPERSPARSE 1 // store matrix in hypersparse form
172+
#define GxB_SPARSE 2 // store matrix as sparse form (compressed vector)
173+
#define GxB_BITMAP 4 // store matrix as a bitmap
174+
#define GxB_FULL 8 // store matrix as full; all entries must be present
175+
176+
typedef void (*GxB_unary_function) (void *, const void *) ;
177+
typedef void (*GxB_binary_function) (void *, const void *, const void *) ;
178+
179+
typedef bool (*GxB_select_function) // return true if A(i,j) is kept
180+
(
181+
GrB_Index i, // row index of A(i,j)
182+
GrB_Index j, // column index of A(i,j)
183+
const void *x, // value of A(i,j)
184+
const void *thunk // optional input for select function
185+
) ;
186+
187+
typedef void (*GxB_index_unary_function)
188+
(
189+
void *z, // output value z, of type ztype
190+
const void *x, // input value x of type xtype; value of v(i) or A(i,j)
191+
GrB_Index i, // row index of A(i,j)
192+
GrB_Index j, // column index of A(i,j), or zero for v(i)
193+
const void *y // input scalar y
194+
) ;
195+
196+
typedef enum
197+
{
198+
// for all GrB_Descriptor fields:
199+
GxB_DEFAULT = 0, // default behavior of the method
200+
201+
// for GrB_OUTP only:
202+
GrB_REPLACE = 1, // clear the output before assigning new values to it
203+
204+
// for GrB_MASK only:
205+
GrB_COMP = 2, // use the structural complement of the input
206+
GrB_SCMP = 2, // same as GrB_COMP (historical; use GrB_COMP instead)
207+
GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values
208+
209+
// for GrB_INP0 and GrB_INP1 only:
210+
GrB_TRAN = 3, // use the transpose of the input
211+
212+
// for GxB_GPU_CONTROL only (DRAFT: in progress, do not use)
213+
GxB_GPU_ALWAYS = 2001,
214+
GxB_GPU_NEVER = 2002,
215+
216+
// for GxB_AxB_METHOD only:
217+
GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method
218+
GxB_AxB_DOT = 1003, // dot product
219+
GxB_AxB_HASH = 1004, // hash-based saxpy method
220+
GxB_AxB_SAXPY = 1005 // saxpy method (any kind)
221+
}
222+
GrB_Desc_Value ;
223+
224+
#include "GB_opaque.h"
225+
#endif
226+
227+
//------------------------------------------------------------------------------
228+
// subset of GB.h
229+
//------------------------------------------------------------------------------
230+
231+
#include "GB_imin.h"
232+
#include "GB_zombie.h"
233+
#include "GB_nnz.h"
234+
#include "GB_partition.h"
235+
#include "GB_binary_search.h"
236+
#include "GB_search_for_vector_template.c"
237+

0 commit comments

Comments
 (0)