Skip to content

Commit 8c381c1

Browse files
Merge pull request #419 from DrTimothyAldenDavis/dev2
for GraphBLAS 10.0.5
2 parents 6d54aab + a6f6233 commit 8c381c1

25 files changed

+716
-538
lines changed

CUDA/template/GB_cuda_jit_AxB_dot3_phase2.cuh

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,11 @@ __inline__ __device__ void blockBucketExclusiveSum
5858
BlockPrefixCallbackOp prefix_op (0) ;
5959

6060
// Have the block iterate over segments of items
61-
int64_t data = 0 ;
6261

63-
for (int block_id = 0 ; block_id < nblocks ; block_id += blocksize)
62+
for (int block_id = 0 ; block_id <= nblocks ; block_id += threads_per_block)
6463
{
64+
int64_t data = 0 ;
65+
6566
// Load a segment of consecutive items that are blocked across threads
6667

6768
int loc = block_id + threadIdx.x;
@@ -79,10 +80,6 @@ __inline__ __device__ void blockBucketExclusiveSum
7980
{
8081
Blockbucket [bucketId*(nblocks+1) + loc] = data ;
8182
}
82-
83-
// this_thread_block().sync();
84-
85-
data = 0 ;
8683
}
8784
}
8885

@@ -104,27 +101,6 @@ __global__ void GB_cuda_AxB_dot3_phase2_kernel
104101
// across, ie size of vector for 1 bucket
105102
)
106103
{
107-
108-
this_thread_block().sync() ; // delete this?
109-
110-
if (gridDim.x >= NBUCKETS)
111-
{
112-
// Cumulative sum across blocks for each bucket
113-
if (blockIdx.x < NBUCKETS)
114-
{
115-
blockBucketExclusiveSum (blockIdx.x, Blockbucket, nblocks) ;
116-
}
117-
}
118-
else
119-
{
120-
if (blockIdx.x == 0)
121-
{
122-
#pragma unroll
123-
for (int b = 0 ; b < NBUCKETS ; b++)
124-
{
125-
blockBucketExclusiveSum (b, Blockbucket, nblocks) ;
126-
}
127-
}
128-
}
104+
blockBucketExclusiveSum (blockIdx.x, Blockbucket, nblocks) ;
129105
}
130106

CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
#define tile_sz 32
3535
#define log2_tile_sz 5
3636
#define shared_vector_size 256
37-
#define blocksize 32
3837
#define threads_per_block 32
3938

4039
//------------------------------------------------------------------------------
@@ -285,11 +284,11 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
285284
work_per_thread = 64 ;
286285
}
287286
int gridsz = GB_ICEIL (mnz, work_per_thread*blocksz) ;
288-
dim3 grid_2 (gridsz) ;
287+
dim3 grid_2dn (gridsz) ;
289288

290289
// kernel_timer.Start();
291290

292-
GB_cuda_AxB_dot3_phase3_dndn_kernel <<grid_2, block, 0, stream>>
291+
GB_cuda_AxB_dot3_phase3_dndn_kernel <<grid_2dn, block, 0, stream>>
293292
(C, M, A, B, theta) ;
294293

295294
}
@@ -360,38 +359,51 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
360359
// kernel_timer.Stop();
361360
// printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
362361

363-
#if 0
364-
printf ("Blockbucket [%d] = {\n", NBUCKETS * number_of_blocks_1) ;
365-
for (int b = 0 ; b < NBUCKETS ; b++)
366-
{
367-
for (int k = 0 ; k < number_of_blocks_1 ; k++)
368-
{
369-
printf (" %ld,\n", Blockbucket [b * (number_of_blocks_1+1) + k]) ;
370-
}
371-
}
372-
printf ("};\n") ;
373-
#endif
374-
375362
//----------------------------------------------------------------------
376363
// phase2: cumsum across the Blockbuckets, propagate to thread level
377364
//----------------------------------------------------------------------
378365

379366
// # of blocks for phase2:
380-
// number_of_blocks_2 = ceil ((number_of_blocks_1+1) / threads_per_block)
381-
int number_of_blocks_2 = ((number_of_blocks_1+1) + threads_per_block - 1) / threads_per_block ;
367+
// // number_of_blocks_2 = ceil ((number_of_blocks_1+1) / threads_per_block)
368+
// int number_of_blocks_2 = ((number_of_blocks_1) + threads_per_block - 1) / threads_per_block ;
382369

383370
// number_of_blocks_2 = 1 ;
384-
printf ("number_of_blocks_2: %d\n", number_of_blocks_2) ;
385-
dim3 grid_2 (number_of_blocks_2) ;
371+
// printf ("number_of_blocks_2: %d\n", number_of_blocks_2) ;
372+
// dim3 grid_2 (number_of_blocks_2) ;
373+
374+
// # of blocks for phase2: one threadblock per bucket
375+
dim3 grid_2 (NBUCKETS) ;
386376

387377
// kernel_timer.Start();
388378

379+
#if 0
380+
for (int b = 0 ; b < NBUCKETS ; b++)
381+
{
382+
printf ("\n\n=================== Bucket: %d\n", b) ;
383+
for (int64_t tid = 0 ; tid <= number_of_blocks_1 ; tid++)
384+
{
385+
printf (" %ld: %ld\n", tid, Blockbucket [b * (number_of_blocks_1+1) + tid]) ;
386+
}
387+
}
388+
#endif
389+
389390
// printf ("Launching sparse phase2:\n") ;
390391
GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
391392
(Blockbucket, number_of_blocks_1) ;
392393
CUDA_OK (cudaGetLastError ( )) ;
393394
CUDA_OK (cudaStreamSynchronize (stream)) ;
394395

396+
#if 0
397+
for (int b = 0 ; b < NBUCKETS ; b++)
398+
{
399+
printf ("\n\n=================== Bucket after cumsum: %d\n", b) ;
400+
for (int64_t tid = 0 ; tid <= number_of_blocks_1 ; tid++)
401+
{
402+
printf (" %ld: %ld\n", tid, Blockbucket [b * (number_of_blocks_1+1) + tid]) ;
403+
}
404+
}
405+
#endif
406+
395407
// get the total number of zombies in the zombie bucket
396408
int64_t s = Blockbucket [number_of_blocks_1] ;
397409
C->nzombies = s ;

Doc/ChangeLog

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
May 5, 2025: version 10.0.5
2+
3+
* revised cmake build system
4+
* (64) bug fix: GrB_assign, C<M>+=A, method 08n, when A is full.
5+
Caught by Gabe Gomez.
6+
* (63) bug fix: GrB_mxm when using the masked dot-product
7+
and the output matrix is iso-valued.
8+
19
Apr 10, 2025: version 10.0.3
210

311
* upgrade xxHash to 0.8.3: contributed by Christoph Grueninger

Doc/GraphBLAS_UserGuide.pdf

-2.62 KB
Binary file not shown.

Doc/GraphBLAS_version.tex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
% version of SuiteSparse:GraphBLAS
22
\date{VERSION
3-
10.0.3,
4-
Apr 10, 2025}
3+
10.0.5,
4+
May 5, 2025}
55

Doc/UserGuide/GrB_installing.tex

Lines changed: 7 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -66,68 +66,11 @@ \subsection{Requirements}
6666
See Section~\ref{cache_path} for more details.
6767

6868
%----------------------------------------
69-
\subsection{Quick Start for MATLAB/Octave}
69+
\subsection{Installing GraphBLAS for MATLAB/Octave}
7070
%----------------------------------------
7171

72-
Before you try this, first be sure you have \verb'cmake' (v3.20 or later) and
73-
the OpenMP library installed. See Section~\ref{mac_openmp} to install OpenMP
74-
on the Mac.
75-
76-
Next, in the MATLAB/Octave Command Window, simply type:
77-
78-
{\small
79-
\begin{verbatim}
80-
cd GraphBLAS/GraphBLAS
81-
graphblas_install
82-
cd test
83-
gbtest \end{verbatim} }
84-
85-
This will use \verb'cmake' to compile the GraphBLAS library. Add your
86-
\verb'GraphBLAS/GraphBLAS' folder to your path, by editting your
87-
\verb'startup.m' script for MATLAB (usually in your \verb'Documents/MATLAB'
88-
folder) or your \verb'~/.octaverc' file for Octave. Add this line to either
89-
file:
90-
91-
{\small
92-
\begin{verbatim}
93-
addpath ('/home/me/GraphBLAS/GraphBLAS') ; \end{verbatim} }
94-
95-
\noindent
96-
where \verb'/home/me/GraphBLAS' is the top-level folder containing your
97-
copy of GraphBLAS.
98-
99-
The \verb'graphblas_install' MATLAB/Octave script may fail to run \verb'cmake'.
100-
If it does, it will print the following workaround, where the commands it
101-
tells you to use will differ depending on the platform:
102-
103-
{\small
104-
\begin{verbatim}
105-
Building GraphBLAS with cmake failed. Try this outside of MATLAB:
106-
107-
cd /home/me/GraphBLAS/GraphBLAS/build
108-
cmake ..
109-
cmake --build . --config Release -j40
110-
111-
Then do this inside MATLAB/Octave:
112-
113-
cd /home/me/GraphBLAS/GraphBLAS/@GrB/private
114-
gbmake \end{verbatim} }
115-
116-
\noindent
117-
where \verb'/home/me/GraphBLAS' is your copy of GraphBLAS.
118-
119-
You cannot use a single copy of the GraphBLAS source distribution to use in
120-
both MATLAB and Octave on the same system at the same time. The \verb'*.o'
121-
files in \verb'GraphBLAS/GraphBLAS/@GrB/private' compiled by
122-
\verb'graphblas_install.m' will conflict with each other. To switch between
123-
MATLAB and Octave, use a second copy of the GraphBLAS source distribution, or
124-
do a clean installation (via \verb'make purge' in the
125-
\verb'GraphBLAS/GraphBLAS/@GrB/private' folder) and redo the above
126-
instructions. There is no need to recompile the \verb'libgraphblas.so' (or
127-
\verb'dylib' on the Mac) since Octave uses
128-
\verb'GraphBLAS/build/libgraphblas.so' while MATLAB uses
129-
\verb'GraphBLAS/GraphBLAS//build/libgraphblas_matlab.so'.
130-
Both MATLAB and Octave can share the same compiled JIT kernels.
72+
See the \verb'GraphBLAS/GraphBLAS/README.md' file for instructions on
73+
how to compile the MATLAB/Octave interface on Linux/Mac/Windows.
13174

13275
%----------------------------------------
13376
\subsection{More details}
@@ -257,27 +200,12 @@ \subsubsection{On the Intel-based Mac}
257200
make CC=icc CXX=icpc \end{verbatim} }
258201

259202
%----------------------------------------
260-
\subsubsection{MATLAB/Octave on the Mac (Apple Silicon based)}
203+
\subsubsection{On IBM Power}
261204
%----------------------------------------
262205

263-
MATLAB on the Apple-Silicon-based Mac is now a native ARM64 application (as of
264-
R2023b). GraphBLAS is not supported for earlier versions of MATLAB on Apple
265-
Silicon.
266-
For Octave, GraphBLAS is designed to use the \verb'brew' version of Octave.
267-
268-
Note that when used inside MATLAB, GraphBLAS must use the same OpenMP library
269-
as MATLAB. Similarly, when used in Octave, it must use the same OpenMP library
270-
as Octave. Both cases are handled by the \verb'graphblas_install.m' script.
271-
272-
Install Octave on the Mac with:
273-
274-
{\small
275-
\begin{verbatim}
276-
brew install octave
277-
\end{verbatim} }
278-
279-
\noindent
280-
Next,
206+
Do not use \verb'gcc' to compile GraphBLAS. At least versions up to 14.2.0
207+
have a bug in the atomic capture on the IBM Power8 (see the comments in the
208+
\verb'Source/mxm/factory' folder). Use \verb'clang' or \verb'xlc' instead.
281209

282210
%----------------------------------------
283211
\subsubsection{On Microsoft Windows}

0 commit comments

Comments
 (0)