Merge pull request #135 from DrTimothyAldenDavis/master

DrTimothyAldenDavis · web-flow · commit 4bb44bb93a30 · 2022-04-08T17:47:44.000-05:00
Master
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -26,10 +26,10 @@ endif ( )
 set ( CMAKE_MACOSX_RPATH TRUE )
 
 # version of SuiteSparse:GraphBLAS
-set ( GraphBLAS_DATE "Apr 6, 2022" )
+set ( GraphBLAS_DATE "Apr 8, 2022" )
 set ( GraphBLAS_VERSION_MAJOR 7 )
 set ( GraphBLAS_VERSION_MINOR 0 )
-set ( GraphBLAS_VERSION_SUB   2 )
+set ( GraphBLAS_VERSION_SUB   3 )
 
 message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}  " date: " ${GraphBLAS_DATE} )
 
diff --git a/CUDA/GB_jit_cache.cu b/CUDA/GB_jit_cache.cu
@@ -46,6 +46,12 @@ namespace jit {
         }
     }
 
+// Get the GraphBLAS source path from the GRAPHBLAS_SOURCE_PATH environment variable (empty string if it is not set)
+    std::string get_user_graphblas_source_path() {
+        auto gb_home = std::getenv("GRAPHBLAS_SOURCE_PATH");
+        if (gb_home != nullptr) return std::string(gb_home);
+        else return std::string();
+    }
 
 
 // Default `GRAPHBLAS_CACHE_PATH` to `$HOME/.GraphBLAS`.
diff --git a/CUDA/GB_jit_cache.h b/CUDA/GB_jit_cache.h
@@ -31,6 +31,7 @@
 namespace jit {
 
 std::string get_user_home_cache_dir();
+std::string get_user_graphblas_source_path();
 std::string getCacheDir(void);
 
 template <typename Tv>
diff --git a/CUDA/jitFactory.hpp b/CUDA/jitFactory.hpp
@@ -89,9 +89,12 @@ static const std::vector<std::string> compiler_flags{
    "-I../../Source",
    "-I../../Source/Template",
    "-I../templates",
-   "-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/templates"
-   "-I/share/workspace/nvidia_projects/GraphBLAS/CUDA/"
-//   "-L../../build/CUDA",
+
+   // Add includes relative to GRAPHBLAS_SOURCE_PATH variable
+   "-I" + jit::get_user_graphblas_source_path() + "/CUDA",
+   "-I" + jit::get_user_graphblas_source_path() + "/CUDA/templates",
+   "-I" + jit::get_user_graphblas_source_path() + "/Source",
+   "-I" + jit::get_user_graphblas_source_path() + "/Source/Template",
    "-I/usr/local/cuda/include",
 };
 
@@ -151,7 +154,7 @@ class phase1launchFactory
     std::string hashable_name = base_name + "_" + kernel_name;
     string_to_be_jitted << hashable_name << std::endl <<
     R"(#include ")" << jit::get_user_home_cache_dir() << "/" << semiring_factory_.filename << R"(")" << std::endl <<
-    R"(#include ")" << hashable_name << R"(.cuh")" << std::endl;
+    R"(#include "templates/)" << hashable_name << R"(.cuh")" << std::endl;
     std::cout << string_to_be_jitted.str();
 
     bool result = false;
diff --git a/CUDA/templates/GB_jit_AxB_phase1.cuh b/CUDA/templates/GB_jit_AxB_phase1.cuh
@@ -313,10 +313,10 @@ __global__ void AxB_phase1
     __syncthreads();
     if (threadIdx.x==0 && blockIdx.x == 0)
     {
-        printf ("Here in phase1, what I see is this:\n") ;
-        printf ("MX(pM) is: %s\n", GB_XSTR (MX (pM))) ;
-        printf ("GB_MULT(x,y) is: %s\n", GB_XSTR (GB_MULT (x,y))) ;
-        printf ("GB_ADD(x,y)  is: %s\n", GB_XSTR (GB_ADD (x,y))) ;
+//        printf ("Here in phase1, what I see is this:\n") ;
+//        printf ("MX(pM) is: %s\n", GB_XSTR (MX (pM))) ;
+//        printf ("GB_MULT(x,y) is: %s\n", GB_XSTR (GB_MULT (x,y))) ;
+//        printf ("GB_ADD(x,y)  is: %s\n", GB_XSTR (GB_ADD (x,y))) ;
         // #define GB_GETA(blob)
         // #define GB_GETB(blob)
         // #define GB_MULT(x,y) (1)
@@ -382,11 +382,11 @@ __global__ void AxB_phase1
 
       int64_t k_end = GB_IMIN(  pointerchunk ,  klast - kfirst +2 ) ;
         
-      if( threadIdx.x ==0) 
-      {
-         printf("chunk%ld pfirst,plast,ch_end =%ld,%ld,%ld kfirst,klast,kend = %ld,%ld,%ld\n",
-                 chunk, pfirst, plast, chunk_end, kfirst, klast, k_end ) ;
-      }
+//      if( threadIdx.x ==0)
+//      {
+//         printf("chunk%ld pfirst,plast,ch_end =%ld,%ld,%ld kfirst,klast,kend = %ld,%ld,%ld\n",
+//                 chunk, pfirst, plast, chunk_end, kfirst, klast, k_end ) ;
+//      }
       __syncthreads();
       
       
@@ -399,10 +399,10 @@ __global__ void AxB_phase1
       __syncthreads();
       if (threadIdx.x == 0)
       {
-        for (int64_t i = 0 ; i < k_end ; i++)
-        {
-            printf ("Mps [%d] = %ld\n", i, Mps [i]) ;
-        }
+//        for (int64_t i = 0 ; i < k_end ; i++)
+//        {
+//            printf ("Mps [%d] = %ld\n", i, Mps [i]) ;
+//        }
       }
       __syncthreads();
 
@@ -420,10 +420,10 @@ __global__ void AxB_phase1
       __syncthreads();
       if (threadIdx.x == 0)
       {
-        for (int64_t i = 0 ; i < chunksize ; i++)
-        {
-            printf ("ks [%d] = %ld\n", i, ks [i]) ;
-        }
+//        for (int64_t i = 0 ; i < chunksize ; i++)
+//        {
+//            printf ("ks [%d] = %ld\n", i, ks [i]) ;
+//        }
       }
       __syncthreads();
 
@@ -461,7 +461,7 @@ __global__ void AxB_phase1
             GB_bucket_code bucket = GB_BUCKET_ZOMBIE ;
             int64_t k = ks[ pM - pfirst ] ;
             //k += ( pM == Mp[k+1] ) ;
-            printf ("tid%d  k %ld pM %ld MX(pM): %d\n", threadIdx.x, k, pM, MX (pM));
+//            printf ("tid%d  k %ld pM %ld MX(pM): %d\n", threadIdx.x, k, pM, MX (pM));
             int64_t i = Mi [ pM ] ;
 int64_t j = k ; // HACK, does not need to be initialized here
 
@@ -515,16 +515,16 @@ pA_end = Ap [i+1] ;
 
                         //bucket = GB_BUCKET_MERGEPATH ;
                         bucket= GB_bucket_assignment ( ainz, bjnz, bvlen) ;
-                        printf ("tid%d  i %ld j %ld ainz %ld bjnz %ld: bucket %d\n",
-                            threadIdx.x, i, j, ainz, bjnz, (int) bucket) ;
+//                        printf ("tid%d  i %ld j %ld ainz %ld bjnz %ld: bucket %d\n",
+//                            threadIdx.x, i, j, ainz, bjnz, (int) bucket) ;
                     }
                 }
             }
 
             if (bucket == GB_BUCKET_ZOMBIE)
             {
                 // mark C(i,j) is a zombie
-                printf ("tid%d pM=%d %d,%d prezombie\n",threadIdx.x,pM,i,j) ;
+//                printf ("tid%d pM=%d %d,%d prezombie\n",threadIdx.x,pM,i,j) ;
                 Ci [pM] = GB_FLIP (i) << 4 ;
                 // GB_BUCKET_COUNT (GB_BUCKET_ZOMBIE) ;
                 my_bucket_0++ ; //0 is the zombie bucket
@@ -534,7 +534,7 @@ pA_end = Ap [i+1] ;
                 // place C(i,j) in its bucket
                 Ci [pM] = (k << 4) + bucket ;
                 GB_BUCKET_COUNT (bucket) ;
-                printf ("tid%d pM=%d %d,%d b=%d\n",threadIdx.x, pM, i,j, (int)bucket) ;
+//                printf ("tid%d pM=%d %d,%d b=%d\n",threadIdx.x, pM, i,j, (int)bucket) ;
             }
          }
             
diff --git a/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh b/CUDA/templates/GB_jit_reduceNonZombiesWarp.cuh
@@ -99,8 +99,6 @@ __global__ void reduceNonZombiesWarp
     for(int i = blockIdx.x * blockDim.x + threadIdx.x; 
         i < N;
         i += blockDim.x * gridDim.x) {
-        printf("tid=%d, N: %ud\n", tid, N);
-
         if (is_sparse && index[i] < 0) continue; // skip zombies
         T fold = g_idata[i];
         sum = GB_ADD( sum, fold );
diff --git a/Doc/ChangeLog b/Doc/ChangeLog
@@ -1,3 +1,7 @@
+Version 7.0.3, Apr 8, 2022
+
+    * faster transpose when using 2 threads
+
 Version 7.0.2, Apr 6, 2022
 
     * (45) bug fix: vector iterator was broken for iterating across a
diff --git a/Doc/GraphBLAS_UserGuide.pdf b/Doc/GraphBLAS_UserGuide.pdf
diff --git a/Doc/GraphBLAS_UserGuide.tex b/Doc/GraphBLAS_UserGuide.tex
@@ -148,6 +148,12 @@ \subsection{Release Notes}
 
 \begin{itemize}
 
+\item Version 7.0.3 (Apr 8, 2022)
+
+    \begin{packed_itemize}
+    \item faster transpose when using 2 threads
+    \end{packed_itemize}
+
 \item Version 7.0.2 (Apr 5, 2022)
 
     \begin{packed_itemize}
@@ -5934,8 +5940,8 @@ \subsection{Serialize/deserialize methods}
 \url{https://cwe.mitre.org/data/definitions/502.html}. The deserialization
 methods do a few basic checks so that no out-of-bounds access occurs during
 deserialization, but the output matrix or vector itself may still be corrupted.
-If the data is untrusted, use check the matrix or vector after
-deserializing it:
+If the data is untrusted, use \verb'GxB_*_fprint' to
+check the matrix or vector after deserializing it:
 
 {\footnotesize
 \begin{verbatim}
diff --git a/Doc/GraphBLAS_version.tex b/Doc/GraphBLAS_version.tex
@@ -1,5 +1,5 @@
 % version of SuiteSparse:GraphBLAS
 \date{VERSION
-7.0.2,
-Apr 6, 2022}
+7.0.3,
+Apr 8, 2022}
 
diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt
@@ -29,10 +29,10 @@ endif ( )
 set ( CMAKE_MACOSX_RPATH TRUE )
 
 # version of SuiteSparse:GraphBLAS (must match ../CMakeLists.txt)
-set ( GraphBLAS_DATE "Apr 6, 2022" )
+set ( GraphBLAS_DATE "Apr 8, 2022" )
 set ( GraphBLAS_VERSION_MAJOR 7 )
 set ( GraphBLAS_VERSION_MINOR 0 )
-set ( GraphBLAS_VERSION_SUB   2 )
+set ( GraphBLAS_VERSION_SUB   3 )
 
 message ( STATUS "Building SuiteSparse:GraphBLAS version: v" ${GraphBLAS_VERSION_MAJOR}.${GraphBLAS_VERSION_MINOR}.${GraphBLAS_VERSION_SUB}  " date: " ${GraphBLAS_DATE} )
 
diff --git a/Include/GraphBLAS.h b/Include/GraphBLAS.h
@@ -221,10 +221,10 @@
 
 // The version of this implementation, and the GraphBLAS API version:
 #define GxB_IMPLEMENTATION_NAME "SuiteSparse:GraphBLAS"
-#define GxB_IMPLEMENTATION_DATE "Apr 6, 2022"
+#define GxB_IMPLEMENTATION_DATE "Apr 8, 2022"
 #define GxB_IMPLEMENTATION_MAJOR 7
 #define GxB_IMPLEMENTATION_MINOR 0
-#define GxB_IMPLEMENTATION_SUB   2
+#define GxB_IMPLEMENTATION_SUB   3
 #define GxB_SPEC_DATE "Nov 15, 2021"
 #define GxB_SPEC_MAJOR 2
 #define GxB_SPEC_MINOR 0
diff --git a/README.md b/README.md
@@ -8,7 +8,7 @@ For the GraphBLAS/GraphBLAS Octave/MATLAB interface *only*:
 SPDX-License-Identifier: GPL-3.0-or-later
 (see below for a discussion of the licensing of this package).
 
-VERSION 7.0.2, Apr 6, 2022
+VERSION 7.0.3, Apr 8, 2022
 
 SuiteSparse:GraphBLAS is a complete implementation of the GraphBLAS standard,
 which defines a set of sparse matrix operations on an extended algebra of
diff --git a/Source/GB_transpose_method.c b/Source/GB_transpose_method.c
@@ -44,9 +44,10 @@ bool GB_transpose_method        // if true: use GB_builder, false: use bucket
     //--------------------------------------------------------------------------
 
     bool atomics ;
-    if (nthreads == 1)
+    if (nthreads <= 2)
     { 
-        // sequential bucket method, no atomics needed
+        // sequential bucket method: no atomics needed
+        // 2 threads: always use non-atomic method
         atomics = false ;
     }
     else if ((double) nthreads * (double) avlen > (double) anz)
@@ -70,7 +71,7 @@ bool GB_transpose_method        // if true: use GB_builder, false: use bucket
         { 
             switch (anzlog)
             {
-                case 14: atol = -4 ; break ;        // 16K entried in A
+                case 14: atol = -4 ; break ;        // 16K entries in A
                 case 15: atol = -3 ; break ;        // 32K
                 case 16: atol = -2 ; break ;        // 64K
                 case 17: atol = -1 ; break ;        // 128K
@@ -159,6 +160,7 @@ bool GB_transpose_method        // if true: use GB_builder, false: use bucket
     // select the method with the least amount of work
     //--------------------------------------------------------------------------
 
-    return (builder_work < bucket_work) ;
+    bool use_builder = (builder_work < bucket_work) ;
+    return (use_builder) ;
 }
 
diff --git a/Tcov/log_Apr6_2022.txt b/Tcov/log_Apr6_2022.txt
diff --git a/Tcov/log_Apr8_2022.txt b/Tcov/log_Apr8_2022.txt
diff --git a/alternative/Makefile b/alternative/Makefile

Original file line number	Diff line number	Diff line change
`@@ -46,6 +46,12 @@ namespace jit {`
`46`	`46`	`}`
`47`	`47`	`}`
`48`	`48`
	`49`	`+// Get the GraphBLAS source path from the GRAPHBLAS_SOURCE_PATH environment variable (empty string if it is not set)`
	`50`	`+ std::string get_user_graphblas_source_path() {`
	`51`	`+ auto gb_home = std::getenv("GRAPHBLAS_SOURCE_PATH");`
	`52`	`+ if (gb_home != nullptr) return std::string(gb_home);`
	`53`	`+ else return std::string();`
	`54`	`+ }`
`49`	`55`
`50`	`56`
`51`	`57`	// Default `GRAPHBLAS_CACHE_PATH` to `$HOME/.GraphBLAS`.