@@ -89,6 +89,8 @@ static const std::vector<std::string> compiler_flags{
     "-I../../Source",
     "-I../../Source/Template",
     "-I../templates",
+    "-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/templates",
+    "-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/",
 //  "-L../../build/CUDA",
     "-I/usr/local/cuda/include",
 };
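
A note on the two added include paths: each entry must be its own comma-terminated string, since adjacent C++ string literals concatenate silently into one bogus flag (the commas are restored above). These flags are ultimately handed to the CUDA runtime compiler one option per element; a minimal sketch of that hand-off, assuming plain NVRTC rather than the jitify layer the project actually uses:

    #include <nvrtc.h>
    #include <string>
    #include <vector>

    // Sketch: compile JIT'd kernel source with the same -I flags, one per option.
    nvrtcProgram compile_with_flags (const char *kernel_source)
    {
        nvrtcProgram prog ;
        nvrtcCreateProgram (&prog, kernel_source, "GB_jit_kernel.cu",
                            0, nullptr, nullptr) ;
        std::vector<const char *> opts ;
        for (const std::string &f : compiler_flags)
            opts.push_back (f.c_str ()) ;
        nvrtcCompileProgram (prog, (int) opts.size (), opts.data ()) ;
        return prog ;
    }

(kernel_source stands in for the string the launcher builds; error checking omitted.)
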
@@ -141,8 +143,10 @@ class phase1launchFactory
     jit::GBJitCache filecache = jit::GBJitCache::Instance () ;
     filecache.getFile (semiring_factory_) ;

+    auto sr_code = std::to_string (semiring_factory_.sr_code) ;
+
     std::stringstream string_to_be_jitted ;
-    std::vector<std::string> template_types = {M->type->name};
+    std::vector<std::string> template_types = {M->type->name, sr_code};

     std::string hashable_name = base_name + "_" + kernel_name;
     string_to_be_jitted << hashable_name << std::endl <<
@@ -155,7 +159,7 @@ class phase1launchFactory
     dim3 grid (get_number_of_blocks (M));
     dim3 block (get_threads_per_block ());

-    jit::launcher ( hashable_name,
+    jit::launcher ( hashable_name + "_" + M->type->name + "_" + sr_code,
                     string_to_be_jitted.str (),
                     header_names,
                     compiler_flags,
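
Why both the template argument list and the launcher name change: the JIT cache is keyed by the kernel's name string, so kernels specialized for different mask types or semirings must map to distinct cache entries, or a stale kernel compiled for one semiring would be silently reused for another. The idea as a standalone sketch (cache_key is a hypothetical helper, not part of this commit):

    #include <cstdint>
    #include <string>

    // Hypothetical: one cache entry per (kernel, mask type, semiring) triple.
    std::string cache_key (const std::string &kernel_name,
                           const std::string &mask_type_name,
                           uint64_t sr_code)
    {
        return kernel_name + "_" + mask_type_name + "_" + std::to_string (sr_code) ;
    }
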
@@ -211,7 +215,7 @@ class phase2launchFactory
     const int64_t mnz = GB_nnz (M) ;
     jit::launcher ( hashable_name,
                     string_to_be_jitted.str (),
-                    header_names,
+                    header_names,
                     compiler_flags,
                     file_callback)
                    .set_kernel_inst ( kernel_name, {})
@@ -228,13 +232,13 @@ class phase2launchFactory
 };

 template<int threads_per_block = 32, int chunk_size = 128>
-class phase2endlaunchFactory
+class phase2endlaunchFactory
 {

   std::string base_name = "GB_jit";
   std::string kernel_name = "AxB_phase2end";

-  public:
+  public:

   int get_threads_per_block () {
     return threads_per_block;
@@ -253,8 +257,8 @@ class phase2endlaunchFactory
     int64_t *bucketp, int64_t *bucket, int64_t *offset,
     GrB_Matrix C, GrB_Matrix M)
   {
-
-    bool result = false ;
+
+    bool result = false ;

     dim3 grid (get_number_of_blocks (M));
     dim3 block (get_threads_per_block ());
@@ -269,7 +273,7 @@ class phase2endlaunchFactory

     jit::launcher ( hashable_name,
                     string_to_be_jitted.str (),
-                    header_names,
+                    header_names,
                     compiler_flags,
                     file_callback)
                    .set_kernel_inst ( kernel_name , {})
@@ -306,8 +310,8 @@ class phase3launchFactory

   bool jitGridBlockLaunch (int64_t start, int64_t end, int64_t *bucketp, int64_t *bucket,
                            GrB_Matrix C, GrB_Matrix M, GrB_Matrix A, GrB_Matrix B) {
-
-    bool result = false ;
+
+    bool result = false ;

     //----------------------------------------------------------------------
     // phase3: do the numerical work
@@ -500,13 +504,9 @@ class reduceFactory
   }

   // Note: this does assume the erased types are compatible w/ the monoid's ztype
-  bool jitGridBlockLaunch (GrB_Matrix A, void *output, unsigned int N,
+  bool jitGridBlockLaunch (GrB_Matrix A, void *output,
                            GrB_Monoid op)
   {
-    int blocksz = get_threads_per_block ();
-    int gridsz = get_number_of_blocks (N);
-    dim3 grid (gridsz);
-    dim3 block (blocksz);

     // TODO: We probably want to "macrofy" the GrB_Monoid and define it in the `string_to_be_jitted`
 //  void GB_stringify_binop
@@ -533,6 +533,14 @@ class reduceFactory
                         hashable_name << std::endl << R"(#include ")" <<
                         hashable_name << R"(.cuh")" << std::endl;

+    bool is_sparse = GB_IS_SPARSE (A);
+    int64_t N = is_sparse ? GB_nnz (A) : GB_NCOLS (A) * GB_NROWS (A);
+
+    int blocksz = get_threads_per_block ();
+    int gridsz = get_number_of_blocks (N);
+    dim3 grid (gridsz);
+    dim3 block (blocksz);
+
     jit::launcher (hashable_name,
                    string_to_be_jitted.str (),
                    header_names,
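
The grid/block computation moves below because N is no longer a caller-supplied parameter: for sparse A the reduction walks the nnz entries, while for a full matrix every cell participates, so N = nrows * ncols. A sketch of the sizing logic, assuming a grid-stride kernel where each block handles about chunk_size items per pass (get_number_of_blocks itself is not shown in this diff, and the cap of 256 blocks is an assumption):

    #include <algorithm>
    #include <cstdint>

    // Assumed shape of get_number_of_blocks: enough blocks to give each
    // one roughly chunk_size items, capped so the grid stays modest.
    int number_of_blocks (int64_t N, int chunk_size)
    {
        int64_t g = (N + chunk_size - 1) / chunk_size ;
        return (int) std::min (g, (int64_t) 256) ;
    }
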
@@ -542,7 +550,7 @@ class reduceFactory
                   .configure (grid, block)

                   // FIXME: GB_ADD is hardcoded into kernel for now
-                  .launch ( A, temp_scalar, N);
+                  .launch ( A, temp_scalar, N, is_sparse);


     checkCudaErrors ( cudaDeviceSynchronize () );
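
Threading is_sparse through .launch implies the device kernel now branches on the storage format. The kernel body is outside this diff; a hedged sketch of the likely shape, where the zombie check and the double type are assumptions, and the plus operator is the hardcoded GB_ADD noted in the FIXME above:

    #include <cstdint>

    // Hypothetical reduce kernel: grid-stride over N values, where N is
    // nnz(A) for sparse A and nrows*ncols for full A (see previous hunk).
    __global__ void reduce_kernel (const int64_t *Ai, const double *Ax,
                                   int64_t N, bool is_sparse, double *out)
    {
        double acc = 0 ;    // assumes a PLUS monoid with identity 0
        for (int64_t k = blockIdx.x * (int64_t) blockDim.x + threadIdx.x ;
             k < N ; k += (int64_t) blockDim.x * gridDim.x)
        {
            if (is_sparse && Ai [k] < 0) continue ;  // skip zombies (assumption)
            acc += Ax [k] ;                          // GB_ADD, hardcoded for now
        }
        atomicAdd (out, acc) ;  // naive; the real kernel would block-reduce first
    }
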
@@ -589,9 +597,9 @@ inline bool GB_cuda_mxm_phase3(GB_cuda_semiring_factory &mysemiringfactory, GB_b
 }


-inline bool GB_cuda_reduce (GrB_Matrix A, void *output, unsigned int N, GrB_Monoid op) {
+inline bool GB_cuda_reduce (GrB_Matrix A, void *output, GrB_Monoid op) {
   reduceFactory rf;
-  return rf.jitGridBlockLaunch (A, output, N, op);
+  return rf.jitGridBlockLaunch (A, output, op);
 }

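Call sites shrink accordingly, since the element count is now derived inside the factory. A before/after sketch (the caller shown is illustrative, not from this commit):

    // before: the caller computed N, easy to get wrong for full matrices
    // GB_cuda_reduce (A, &result, (unsigned int) GB_nnz (A), monoid) ;

    // after: A's sparsity determines the element count internally
    GB_cuda_reduce (A, &result, monoid) ;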