Skip to content

Commit 8c381c1

Browse files
Merge pull request #419 from DrTimothyAldenDavis/dev2
for GraphBLAS 10.0.5
2 parents 6d54aab + a6f6233 commit 8c381c1

25 files changed

+716
-538
lines changed

CUDA/template/GB_cuda_jit_AxB_dot3_phase2.cuh

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -58,10 +58,11 @@ __inline__ __device__ void blockBucketExclusiveSum
5858
BlockPrefixCallbackOp prefix_op (0) ;
5959

6060
// Have the block iterate over segments of items
61-
int64_t data = 0 ;
6261

63-
for (int block_id = 0 ; block_id < nblocks ; block_id += blocksize)
62+
for (int block_id = 0 ; block_id <= nblocks ; block_id += threads_per_block)
6463
{
64+
int64_t data = 0 ;
65+
6566
// Load a segment of consecutive items that are blocked across threads
6667

6768
int loc = block_id + threadIdx.x;
@@ -79,10 +80,6 @@ __inline__ __device__ void blockBucketExclusiveSum
7980
{
8081
Blockbucket [bucketId*(nblocks+1) + loc] = data ;
8182
}
82-
83-
// this_thread_block().sync();
84-
85-
data = 0 ;
8683
}
8784
}
8885

@@ -104,27 +101,6 @@ __global__ void GB_cuda_AxB_dot3_phase2_kernel
104101
// across, ie size of vector for 1 bucket
105102
)
106103
{
107-
108-
this_thread_block().sync() ; // delete this?
109-
110-
if (gridDim.x >= NBUCKETS)
111-
{
112-
// Cumulative sum across blocks for each bucket
113-
if (blockIdx.x < NBUCKETS)
114-
{
115-
blockBucketExclusiveSum (blockIdx.x, Blockbucket, nblocks) ;
116-
}
117-
}
118-
else
119-
{
120-
if (blockIdx.x == 0)
121-
{
122-
#pragma unroll
123-
for (int b = 0 ; b < NBUCKETS ; b++)
124-
{
125-
blockBucketExclusiveSum (b, Blockbucket, nblocks) ;
126-
}
127-
}
128-
}
104+
blockBucketExclusiveSum (blockIdx.x, Blockbucket, nblocks) ;
129105
}
130106

CUDA/template/GB_jit_kernel_cuda_AxB_dot3.cu

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@
3434
#define tile_sz 32
3535
#define log2_tile_sz 5
3636
#define shared_vector_size 256
37-
#define blocksize 32
3837
#define threads_per_block 32
3938

4039
//------------------------------------------------------------------------------
@@ -285,11 +284,11 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
285284
work_per_thread = 64 ;
286285
}
287286
int gridsz = GB_ICEIL (mnz, work_per_thread*blocksz) ;
288-
dim3 grid_2 (gridsz) ;
287+
dim3 grid_2dn (gridsz) ;
289288

290289
// kernel_timer.Start();
291290

292-
GB_cuda_AxB_dot3_phase3_dndn_kernel <<grid_2, block, 0, stream>>
291+
GB_cuda_AxB_dot3_phase3_dndn_kernel <<grid_2dn, block, 0, stream>>
293292
(C, M, A, B, theta) ;
294293

295294
}
@@ -360,38 +359,51 @@ GB_JIT_CUDA_KERNEL_DOT3_PROTO (GB_jit_kernel)
360359
// kernel_timer.Stop();
361360
// printf ("(GPU phase1 %12.6g ms )\n", kernel_timer.Elapsed()) ;
362361

363-
#if 0
364-
printf ("Blockbucket [%d] = {\n", NBUCKETS * number_of_blocks_1) ;
365-
for (int b = 0 ; b < NBUCKETS ; b++)
366-
{
367-
for (int k = 0 ; k < number_of_blocks_1 ; k++)
368-
{
369-
printf (" %ld,\n", Blockbucket [b * (number_of_blocks_1+1) + k]) ;
370-
}
371-
}
372-
printf ("};\n") ;
373-
#endif
374-
375362
//----------------------------------------------------------------------
376363
// phase2: cumsum across the Blockbuckets, propagate to thread level
377364
//----------------------------------------------------------------------
378365

379366
// # of blocks for phase2:
380-
// number_of_blocks_2 = ceil ((number_of_blocks_1+1) / threads_per_block)
381-
int number_of_blocks_2 = ((number_of_blocks_1+1) + threads_per_block - 1) / threads_per_block ;
367+
// // number_of_blocks_2 = ceil ((number_of_blocks_1+1) / threads_per_block)
368+
// int number_of_blocks_2 = ((number_of_blocks_1) + threads_per_block - 1) / threads_per_block ;
382369

383370
// number_of_blocks_2 = 1 ;
384-
printf ("number_of_blocks_2: %d\n", number_of_blocks_2) ;
385-
dim3 grid_2 (number_of_blocks_2) ;
371+
// printf ("number_of_blocks_2: %d\n", number_of_blocks_2) ;
372+
// dim3 grid_2 (number_of_blocks_2) ;
373+
374+
// # of blocks for phase2: one threadblock per bucket
375+
dim3 grid_2 (NBUCKETS) ;
386376

387377
// kernel_timer.Start();
388378

379+
#if 0
380+
for (int b = 0 ; b < NBUCKETS ; b++)
381+
{
382+
printf ("\n\n=================== Bucket: %d\n", b) ;
383+
for (int64_t tid = 0 ; tid <= number_of_blocks_1 ; tid++)
384+
{
385+
printf (" %ld: %ld\n", tid, Blockbucket [b * (number_of_blocks_1+1) + tid]) ;
386+
}
387+
}
388+
#endif
389+
389390
// printf ("Launching sparse phase2:\n") ;
390391
GB_cuda_AxB_dot3_phase2_kernel <<<grid_2, block, 0, stream>>>
391392
(Blockbucket, number_of_blocks_1) ;
392393
CUDA_OK (cudaGetLastError ( )) ;
393394
CUDA_OK (cudaStreamSynchronize (stream)) ;
394395

396+
#if 0
397+
for (int b = 0 ; b < NBUCKETS ; b++)
398+
{
399+
printf ("\n\n=================== Bucket after cumsum: %d\n", b) ;
400+
for (int64_t tid = 0 ; tid <= number_of_blocks_1 ; tid++)
401+
{
402+
printf (" %ld: %ld\n", tid, Blockbucket [b * (number_of_blocks_1+1) + tid]) ;
403+
}
404+
}
405+
#endif
406+
395407
// get the total number of zombies in the zombie bucket
396408
int64_t s = Blockbucket [number_of_blocks_1] ;
397409
C->nzombies = s ;

Doc/ChangeLog

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
May 5, 2025: version 10.0.5
2+
3+
* revised cmake build system
4+
* (64) bug fix: GrB_assign, C<M>+=A, method 08n, when A is full.
5+
Caught by Gabe Gomez.
6+
* (63) bug fix: GrB_mxm when using the masked dot-product
7+
and the output matrix is iso-valued.
8+
19
Apr 10, 2025: version 10.0.3
210

311
* upgrade xxHash to 0.8.3: contributed by Christoph Grueninger

Doc/GraphBLAS_UserGuide.pdf

-2.62 KB
Binary file not shown.

Doc/GraphBLAS_version.tex

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
% version of SuiteSparse:GraphBLAS
22
\date{VERSION
3-
10.0.3,
4-
Apr 10, 2025}
3+
10.0.5,
4+
May 5, 2025}
55

Doc/UserGuide/GrB_installing.tex

Lines changed: 7 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -66,68 +66,11 @@ \subsection{Requirements}
6666
See Section~\ref{cache_path} for more details.
6767

6868
%----------------------------------------
69-
\subsection{Quick Start for MATLAB/Octave}
69+
\subsection{Installing GraphBLAS for MATLAB/Octave}
7070
%----------------------------------------
7171

72-
Before you try this, first be sure you have \verb'cmake' (v3.20 or later) and
73-
the OpenMP library installed. See Section~\ref{mac_openmp} to install OpenMP
74-
on the Mac.
75-
76-
Next, in the MATLAB/Octave Command Window, simply type:
77-
78-
{\small
79-
\begin{verbatim}
80-
cd GraphBLAS/GraphBLAS
81-
graphblas_install
82-
cd test
83-
gbtest \end{verbatim} }
84-
85-
This will use \verb'cmake' to compile the GraphBLAS library. Add your
86-
\verb'GraphBLAS/GraphBLAS' folder to your path, by editting your
87-
\verb'startup.m' script for MATLAB (usually in your \verb'Documents/MATLAB'
88-
folder) or your \verb'~/.octaverc' file for Octave. Add this line to either
89-
file:
90-
91-
{\small
92-
\begin{verbatim}
93-
addpath ('/home/me/GraphBLAS/GraphBLAS') ; \end{verbatim} }
94-
95-
\noindent
96-
where \verb'/home/me/GraphBLAS' is the top-level folder containing your
97-
copy of GraphBLAS.
98-
99-
The \verb'graphblas_install' MATLAB/Octave script may fail to run \verb'cmake'.
100-
If it does, it will print the following workaround, where the commands it
101-
tells you to use will differ depending on the platform:
102-
103-
{\small
104-
\begin{verbatim}
105-
Building GraphBLAS with cmake failed. Try this outside of MATLAB:
106-
107-
cd /home/me/GraphBLAS/GraphBLAS/build
108-
cmake ..
109-
cmake --build . --config Release -j40
110-
111-
Then do this inside MATLAB/Octave:
112-
113-
cd /home/me/GraphBLAS/GraphBLAS/@GrB/private
114-
gbmake \end{verbatim} }
115-
116-
\noindent
117-
where \verb'/home/me/GraphBLAS' is your copy of GraphBLAS.
118-
119-
You cannot use a single copy of the GraphBLAS source distribution to use in
120-
both MATLAB and Octave on the same system at the same time. The \verb'*.o'
121-
files in \verb'GraphBLAS/GraphBLAS/@GrB/private' compiled by
122-
\verb'graphblas_install.m' will conflict with each other. To switch between
123-
MATLAB and Octave, use a second copy of the GraphBLAS source distribution, or
124-
do a clean installation (via \verb'make purge' in the
125-
\verb'GraphBLAS/GraphBLAS/@GrB/private' folder) and redo the above
126-
instructions. There is no need to recompile the \verb'libgraphblas.so' (or
127-
\verb'dylib' on the Mac) since Octave uses
128-
\verb'GraphBLAS/build/libgraphblas.so' while MATLAB uses
129-
\verb'GraphBLAS/GraphBLAS//build/libgraphblas_matlab.so'.
130-
Both MATLAB and Octave can share the same compiled JIT kernels.
72+
See the \verb'GraphBLAS/GraphBLAS/README.md' file for instructions on
73+
how to compile the MATLAB/Octave interface on Linux/Mac/Windows.
13174

13275
%----------------------------------------
13376
\subsection{More details}
@@ -257,27 +200,12 @@ \subsubsection{On the Intel-based Mac}
257200
make CC=icc CXX=icpc \end{verbatim} }
258201

259202
%----------------------------------------
260-
\subsubsection{MATLAB/Octave on the Mac (Apple Silicon based)}
203+
\subsubsection{On IBM Power}
261204
%----------------------------------------
262205

263-
MATLAB on the Apple-Silicon-based Mac is now a native ARM64 application (as of
264-
R2023b). GraphBLAS is not supported for earlier versions of MATLAB on Apple
265-
Silicon.
266-
For Octave, GraphBLAS is designed to use the \verb'brew' version of Octave.
267-
268-
Note that when used inside MATLAB, GraphBLAS must use the same OpenMP library
269-
as MATLAB. Similarly, when used in Octave, it must use the same OpenMP library
270-
as Octave. Both cases are handled by the \verb'graphblas_install.m' script.
271-
272-
Install Octave on the Mac with:
273-
274-
{\small
275-
\begin{verbatim}
276-
brew install octave
277-
\end{verbatim} }
278-
279-
\noindent
280-
Next,
206+
Do not use \verb'gcc' to compile GraphBLAS. At least versions up to 14.2.0
207+
have a bug in the atomic capture on the IBM Power8 (see the comments in the
208+
\verb'Source/mxm/factory' folder). Use \verb'clang' or \verb'xlc' instead.
281209

282210
%----------------------------------------
283211
\subsubsection{On Microsoft Windows}

0 commit comments

Comments
 (0)