Skip to content

Commit 9c3c0bf

Browse files
Merge pull request #133 from DrTimothyAldenDavis/master
Master
2 parents 5b6b45f + 268d4b9 commit 9c3c0bf

24 files changed

+306
-214
lines changed

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ endif ( )
2626
set ( CMAKE_MACOSX_RPATH TRUE )
2727

2828
# version of SuiteSparse:GraphBLAS
29-
set ( GraphBLAS_DATE "Apr 5, 2022" )
29+
set ( GraphBLAS_DATE "Apr 6, 2022" )
3030
set ( GraphBLAS_VERSION_MAJOR 7 )
3131
set ( GraphBLAS_VERSION_MINOR 0 )
3232
set ( GraphBLAS_VERSION_SUB 2 )

CUDA/GB_AxB_dot3_cuda.cpp

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
6868
// check inputs
6969
//--------------------------------------------------------------------------
7070

71+
printf ("HERE IN cuda dot3, mask_struct is %d\n", Mask_struct) ;
72+
7173
// when CUDA is enabled, no static headers are used in all of GraphBLAS
7274
GrB_Info info ;
7375
ASSERT (C != NULL && !(C->static_header)) ;
@@ -155,8 +157,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
155157
//auto *Cxtemp = C->x ;
156158
//cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) );
157159
//cudaMalloc ((void**) &(C->x), cnz * C->type->size );
158-
CHECK_CUDA_SIMPLE(cudaMemAdvise( C->i, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device));
159-
CHECK_CUDA_SIMPLE(cudaMemAdvise( C->x, cnz * C->type->size , cudaMemAdviseSetPreferredLocation, device));
160+
CHECK_CUDA_SIMPLE(cudaMemAdvise( C->i, (cnz+1) * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device));
161+
CHECK_CUDA_SIMPLE(cudaMemAdvise( C->x, (cnz+1) * C->type->size , cudaMemAdviseSetPreferredLocation, device));
160162

161163

162164
//--------------------------------------------------------------------------
@@ -185,7 +187,7 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
185187

186188
// (1) create the semiring code and name
187189
mysemiring.semiring_factory ( semiring, flipxy,
188-
ctype, A->type, B->type, M->type, Mask_struct, // matrix types
190+
ctype, M->type, A->type, B->type, Mask_struct, // matrix types
189191
false, GB_sparsity(C), GB_sparsity(M), GB_sparsity(A), GB_sparsity(B) ) ;
190192

191193
// (2) ensure the jitifier has "GB_semiring_[mysemiring.sr_code].h"
@@ -243,8 +245,8 @@ GrB_Info GB_AxB_dot3_cuda // C<M> = A'*B using dot product method
243245
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL)) ; //stream_data) ;
244246
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
245247
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL )) ; //stream_data) ;
246-
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, mnz * sizeof (int64_t), device, NULL )); //stream_data) ;
247-
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, mnz * C->type->size, device, NULL )); //stream_data) ;
248+
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->i, (cnz+1) * sizeof (int64_t), device, NULL )); //stream_data) ;
249+
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( C->x, (cnz+1) * C->type->size, device, NULL )); //stream_data) ;
248250
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL)); // stream_data) ;
249251
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL )) ; //stream_data) ;
250252
CHECK_CUDA_SIMPLE(cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL )) ; //stream_data) ;

CUDA/GB_AxB_dot3_cuda_branch.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,9 @@ bool GB_AxB_dot3_cuda_branch
1818
GB_Context Context
1919
)
2020
{
21+
22+
printf ("HERE IN cuda branch, mask_struct is %d\n", Mask_struct) ;
23+
2124
// very rough estimate of the work to do
2225
double adeg = ((double) GB_nnz (A)) / ((double) GB_IMAX (1, A->nvec)) ;
2326
double bdeg = ((double) GB_nnz (B)) / ((double) GB_IMAX (1, B->nvec)) ;

CUDA/GB_cuda_semiring_factory.hpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
2727
public:
2828

2929
uint64_t sr_code;
30+
bool mask_struct;
3031

3132
// file ptr
3233
FILE *fp;
@@ -66,7 +67,9 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
6667
int B_sparsity // sparsity structure of B
6768
)
6869
{
69-
std::cout<<" calling stringify semiring: " << semiring << std::endl;
70+
std::cout<<" calling stringify semiring: " << std::endl;
71+
GxB_Semiring_fprint (semiring, "stringfiy the smiering", GxB_COMPLETE, stdout) ;
72+
std::cout<<" Mask_struct: " << Mask_struct << std::endl;
7073
uint64_t scode;
7174
GB_enumify_semiring (
7275
// output:
@@ -90,6 +93,7 @@ class GB_cuda_semiring_factory: public jit::File_Desc {
9093
std::cout << "done stringify semiring" << std::endl;
9194
this->sr_code = scode;
9295

96+
mask_struct = Mask_struct;
9397
std::stringstream ss;
9498
ss << "GB_semiring_" << this->sr_code << ".h";
9599

CUDA/GB_reduce_to_scalar_cuda.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ GrB_Info GB_reduce_to_scalar_cuda
3535

3636
int64_t nz = GB_nnz(A);
3737

38-
GB_cuda_reduce( A, s, (unsigned int)nz, reduce);
38+
GB_cuda_reduce( A, s, reduce);
3939

4040
printf("num_triangles = %d\n", s[0] );
4141

CUDA/GB_reduce_to_scalar_cuda_branch.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,17 @@ bool GB_reduce_to_scalar_cuda_branch
1313
// work to do
1414
double work = GB_nnz (A) ;
1515

16+
// std::cout << "IS_BITMAP: " << GB_IS_BITMAP (A) << "IS_FULL: " << GB_IS_FULL(A) << std::endl;
17+
1618
int ngpus_to_use = GB_ngpus_to_use (work) ;
1719
GBURBLE (" work:%g gpus:%d ", work, ngpus_to_use) ;
1820
printf (" work:%g gpus:%d ", work, ngpus_to_use) ;
1921
if (ngpus_to_use > 0
2022
&& (reduce->header_size == 0) // semiring is built-in
21-
&& (A->type->code != GB_UDT_code))
22-
{
23+
&& (A->type->code != GB_UDT_code)
24+
// FIXME: this is easy
25+
&& !A->iso
26+
) {
2327
return true;
2428
}
2529
else

CUDA/GB_stringify_mask.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,8 @@ void GB_macrofy_mask // return enum to define mask macros
309309
break ;
310310
}
311311

312+
printf ("HERE is the Mask stuff:\n%s\n", f) ;
313+
312314
fprintf( fp, "%s\n", f ) ;
313315
}
314316

CUDA/GB_stringify_semiring.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ void GB_stringify_semiring // build a semiring (name and code)
4444
semiring, flipxy,
4545
ctype, mtype, atype, btype, Mask_struct, Mask_comp,
4646
C_sparsity, M_sparsity, A_sparsity, B_sparsity) ;
47-
printf("done enumify semiring\n");
47+
printf("done enumify semiring: scode is %lu\n", scode);
4848

4949
GB_macrofy_semiring ( fp, scode) ;
5050

@@ -78,7 +78,7 @@ void GB_enumify_semiring // enumerate a semiring
7878
//--------------------------------------------------------------------------
7979
// get the semiring
8080
//--------------------------------------------------------------------------
81-
printf("inside enumify: %p\n", semiring);
81+
printf("inside enumify: \n") ;
8282
GxB_print (semiring, 3) ;
8383

8484
printf("Getting semiring add\n");
@@ -211,9 +211,11 @@ void GB_enumify_semiring // enumerate a semiring
211211
// enumify the mask
212212
//--------------------------------------------------------------------------
213213

214-
printf("Invoking enumify_mask, mtype %p\n", mtype);
214+
printf("Invoking enumify_mask, mtype: \n");
215+
GxB_print (mtype, 3) ;
215216
int mtype_code = (mtype == NULL) ? 0 : mtype->code ; // 0 to 14
216217
int mask_ecode ;
218+
printf("Mask_struct: %d, Mask_comp: %d\n", Mask_struct, Mask_comp);
217219
GB_enumify_mask (&mask_ecode, mtype_code, Mask_struct, Mask_comp) ;
218220
printf ("got mask_ecode: %d\n", mask_ecode) ;
219221

CUDA/jitFactory.hpp

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ static const std::vector<std::string> compiler_flags{
8989
"-I../../Source",
9090
"-I../../Source/Template",
9191
"-I../templates",
92+
"-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/templates"
93+
"-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/"
9294
// "-L../../build/CUDA",
9395
"-I/usr/local/cuda/include",
9496
};
@@ -141,8 +143,10 @@ class phase1launchFactory
141143
jit::GBJitCache filecache = jit::GBJitCache::Instance() ;
142144
filecache.getFile (semiring_factory_) ;
143145

146+
auto sr_code = std::to_string(semiring_factory_.sr_code);
147+
144148
std::stringstream string_to_be_jitted ;
145-
std::vector<std::string> template_types = {M->type->name};
149+
std::vector<std::string> template_types = {M->type->name, sr_code};
146150

147151
std::string hashable_name = base_name + "_" + kernel_name;
148152
string_to_be_jitted << hashable_name << std::endl <<
@@ -155,7 +159,7 @@ class phase1launchFactory
155159
dim3 grid(get_number_of_blocks(M));
156160
dim3 block(get_threads_per_block());
157161

158-
jit::launcher( hashable_name,
162+
jit::launcher( hashable_name + "_" + M->type->name + "_" + sr_code,
159163
string_to_be_jitted.str(),
160164
header_names,
161165
compiler_flags,
@@ -211,7 +215,7 @@ class phase2launchFactory
211215
const int64_t mnz = GB_nnz (M) ;
212216
jit::launcher( hashable_name,
213217
string_to_be_jitted.str(),
214-
header_names,
218+
header_names,
215219
compiler_flags,
216220
file_callback)
217221
.set_kernel_inst( kernel_name, {})
@@ -228,13 +232,13 @@ class phase2launchFactory
228232
};
229233

230234
template< int threads_per_block = 32, int chunk_size = 128>
231-
class phase2endlaunchFactory
235+
class phase2endlaunchFactory
232236
{
233237

234238
std::string base_name = "GB_jit";
235239
std::string kernel_name = "AxB_phase2end";
236240

237-
public:
241+
public:
238242

239243
int get_threads_per_block() {
240244
return threads_per_block;
@@ -253,8 +257,8 @@ class phase2endlaunchFactory
253257
int64_t *bucketp, int64_t *bucket, int64_t *offset,
254258
GrB_Matrix C, GrB_Matrix M)
255259
{
256-
257-
bool result = false;
260+
261+
bool result = false;
258262

259263
dim3 grid(get_number_of_blocks(M));
260264
dim3 block(get_threads_per_block());
@@ -269,7 +273,7 @@ class phase2endlaunchFactory
269273

270274
jit::launcher( hashable_name,
271275
string_to_be_jitted.str(),
272-
header_names,
276+
header_names,
273277
compiler_flags,
274278
file_callback)
275279
.set_kernel_inst( kernel_name , {})
@@ -306,8 +310,8 @@ class phase3launchFactory
306310

307311
bool jitGridBlockLaunch(int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket,
308312
GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) {
309-
310-
bool result = false;
313+
314+
bool result = false;
311315

312316
//----------------------------------------------------------------------
313317
// phase3: do the numerical work
@@ -500,13 +504,9 @@ class reduceFactory
500504
}
501505

502506
// Note: this does assume the erased types are compatible w/ the monoid's ztype
503-
bool jitGridBlockLaunch(GrB_Matrix A, void* output, unsigned int N,
507+
bool jitGridBlockLaunch(GrB_Matrix A, void* output,
504508
GrB_Monoid op)
505509
{
506-
int blocksz = get_threads_per_block();
507-
int gridsz = get_number_of_blocks(N);
508-
dim3 grid(gridsz);
509-
dim3 block(blocksz);
510510

511511
// TODO: We probably want to "macrofy" the GrB_Monoid and define it in the `string_to_be_jitted`
512512
// void GB_stringify_binop
@@ -533,6 +533,14 @@ class reduceFactory
533533
hashable_name << std::endl << R"(#include ")" <<
534534
hashable_name << R"(.cuh")" << std::endl;
535535

536+
bool is_sparse = GB_IS_SPARSE(A);
537+
int64_t N = is_sparse ? GB_nnz(A) : GB_NCOLS(A) * GB_NROWS(A);
538+
539+
int blocksz = get_threads_per_block();
540+
int gridsz = get_number_of_blocks(N);
541+
dim3 grid(gridsz);
542+
dim3 block(blocksz);
543+
536544
jit::launcher(hashable_name,
537545
string_to_be_jitted.str(),
538546
header_names,
@@ -542,7 +550,7 @@ class reduceFactory
542550
.configure(grid, block)
543551

544552
// FIXME: GB_ADD is hardcoded into kernel for now
545-
.launch( A, temp_scalar, N);
553+
.launch( A, temp_scalar, N, is_sparse);
546554

547555

548556
checkCudaErrors( cudaDeviceSynchronize() );
@@ -589,9 +597,9 @@ inline bool GB_cuda_mxm_phase3(GB_cuda_semiring_factory &mysemiringfactory, GB_b
589597
}
590598

591599

592-
inline bool GB_cuda_reduce(GrB_Matrix A, void *output, unsigned int N, GrB_Monoid op) {
600+
inline bool GB_cuda_reduce(GrB_Matrix A, void *output, GrB_Monoid op) {
593601
reduceFactory rf;
594-
return rf.jitGridBlockLaunch(A, output, N, op);
602+
return rf.jitGridBlockLaunch(A, output, op);
595603
}
596604

597605

CUDA/matrix.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,10 @@
5757
// remainder of this file is extracted from GraphBLAS.h:
5858
//------------------------------------------------------------------------------
5959

60+
// GB_STR: convert the content of x into a string "x"
61+
#define GB_XSTR(x) GB_STR(x)
62+
#define GB_STR(x) #x
63+
6064
#undef GB_PUBLIC
6165
#define GB_PUBLIC extern
6266
#undef GxB_MAX_NAME_LEN

0 commit comments

Comments
 (0)