1+ /*
2+ * Copyright (c) 2022 NECSTLab, Politecnico di Milano. All rights reserved.
3+ *
4+ * Redistribution and use in source and binary forms, with or without
5+ * modification, are permitted provided that the following conditions
6+ * are met:
7+ * * Redistributions of source code must retain the above copyright
8+ * notice, this list of conditions and the following disclaimer.
9+ * * Redistributions in binary form must reproduce the above copyright
10+ * notice, this list of conditions and the following disclaimer in the
11+ * documentation and/or other materials provided with the distribution.
12+ * * Neither the name of NECSTLab nor the names of its
13+ * contributors may be used to endorse or promote products derived
14+ * from this software without specific prior written permission.
15+ * * Neither the name of Politecnico di Milano nor the names of its
16+ * contributors may be used to endorse or promote products derived
17+ * from this software without specific prior written permission.
18+ *
19+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
20+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
23+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
26+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
27+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30+ */
31+
32+ package it .necst .grcuda .benchmark .bench ;
33+
34+ import it .necst .grcuda .benchmark .Benchmark ;
35+ import it .necst .grcuda .benchmark .BenchmarkConfig ;
36+ import org .graalvm .polyglot .Value ;
37+
38+ import java .util .Random ;
39+
40+ import static org .junit .Assert .assertEquals ;
41+
42+ public class B9M extends Benchmark {
43+ /*
44+ Compute the conjugate gradient algorithm on a dense symmetric matrix.
45+ The matrix-vector multiplications are row-partitioned to scale across multiple GPUs;
46+ */
47+
48+ private static final String PRECONDITION_KERNEL = "" +
49+ "// Add a small epsilon to the main diagonal:\n " +
50+ "extern \" C\" __global__ void precondition(float *A, int n, int m, int offset) {\n " +
51+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < m; i += blockDim.x * gridDim.x) {\n " +
52+ " A[i * n + i + offset] += 1e-12; \n " +
53+ " }\n " +
54+ "}" ;
55+
56+ private static final String MMUL_KERNEL = "" +
57+ "// z = x @ y;\n " +
58+ "extern \" C\" __global__ void matrix_vector_mult(const float* x, const float* y, float* z, int n, int m, int z_offset) {\n " +
59+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n " +
60+ " float sum = 0;\n " +
61+ " for (int j = 0; j < m; j++) { \n " +
62+ " sum += x[i * m + j] * y[j];\n " +
63+ " }\n " +
64+ " z[z_offset + i] = sum;\n " +
65+ " }\n " +
66+ "}\n " +
67+ "// z := w + alpha * A @ y;\n " +
68+ "extern \" C\" __global__ void matrix_vector_mult_axpy(const float* x, const float* y, const float *w, const float alpha, float* z, int n, int m, int z_offset) {\n " +
69+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n " +
70+ " float sum = 0;\n " +
71+ " for (int j = 0; j < m; j++) { \n " +
72+ " sum += x[i * m + j] * y[j];\n " +
73+ " }\n " +
74+ " z[z_offset + i] = alpha * sum + w[z_offset + i];\n " +
75+ " }\n " +
76+ "}" ;
77+
78+ private static final String DP_KERNEL = "" +
79+ "__inline__ __device__ float warp_reduce(float val) {\n " +
80+ " int warp_size = 32;\n " +
81+ " for (int offset = warp_size / 2; offset > 0; offset /= 2) \n " +
82+ " val += __shfl_down_sync(0xFFFFFFFF, val, offset);\n " +
83+ " return val;\n " +
84+ "}\n " +
85+ "// z = <x, x>;\n " +
86+ "extern \" C\" __global__ void l2_norm(const float *x, float* z, int N) {\n " +
87+ " int warp_size = 32;\n " +
88+ " float sum = float(0);\n " +
89+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n " +
90+ " float x_tmp = x[i];\n " +
91+ " sum += x_tmp * x_tmp;\n " +
92+ " }\n " +
93+ " sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n " +
94+ " if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0 but faster\n " +
95+ " atomicAdd(z, sum); // The first thread in the warp updates the output;\n " +
96+ "}\n " +
97+ "// z = <x, y>;\n " +
98+ "extern \" C\" __global__ void dot(const float *x, const float *y, float* z, int N) {\n " +
99+ " int warp_size = 32;\n " +
100+ " float sum = float(0);\n " +
101+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) {\n " +
102+ " sum += x[i] * y[i];\n " +
103+ " }\n " +
104+ " sum = warp_reduce(sum); // Obtain the sum of values in the current warp;\n " +
105+ " if ((threadIdx.x & (warp_size - 1)) == 0) // Same as (threadIdx.x % warp_size) == 0 but faster\n " +
106+ " atomicAdd(z, sum); // The first thread in the warp updates the output;\n " +
107+ "}" ;
108+
109+ private static final String SAXPY_KERNEL = "" +
110+ "// y = val + alpha * x;\n " +
111+ "extern \" C\" __global__ void saxpy(float* y, const float *val, const float *x, float alpha, int n) {\n " +
112+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n " +
113+ " y[i] = val[i] + alpha * x[i];\n " +
114+ " }\n " +
115+ "}\n " +
116+ "// Simply copy array x into y;\n " +
117+ "extern \" C\" __global__ void cpy(float *y, const float *x, int n) {\n " +
118+ " for(int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {\n " +
119+ " y[i] = x[i];\n " +
120+ " }\n " +
121+ "}" ;
122+
123+ private Value precondition_kernel , mmul_kernel , mmul_axpy_kernel , l2_norm_kernel , dp_kernel , saxpy_kernel , copy_kernel , initialize_random_symmetric_matrix ;
124+ private Value [] A ;
125+ private Value x , b , p , r , y , t1 , t2 ;
126+ private int S ;
127+
128+ private final int P = 16 ;
129+ private final int ITER = 50 ;
130+
131+ public B9M (BenchmarkConfig currentConfig ) {
132+ super (currentConfig );
133+
134+ this .S = 0 ;
135+ this .A = new Value [this .P ];
136+ for (int i = 0 ; i < this .P ; i ++) this .A [i ] = null ;
137+ this .x = null ;
138+ this .b = null ;
139+ this .p = null ;
140+ this .r = null ;
141+ this .y = null ;
142+ this .t1 = null ;
143+ this .t2 = null ;
144+
145+ this .mmul_axpy_kernel = null ;
146+ this .mmul_kernel = null ;
147+ this .l2_norm_kernel = null ;
148+ this .dp_kernel = null ;
149+ this .saxpy_kernel = null ;
150+ this .copy_kernel = null ;
151+ }
152+
153+ @ Override
154+ public void allocateTest (int iteration ) {
155+ this .S = Math .floorDiv (config .size + this .P - 1 , this .P );
156+
157+ // Allocate vectors
158+ for (int i = 0 ; i < this .P ; i ++)
159+ this .A [i ] = requestArray ("float" , this .S * config .size );
160+ this .x = requestArray ("float" , config .size );
161+ this .b = requestArray ("float" , config .size );
162+ this .p = requestArray ("float" , config .size );
163+ this .r = requestArray ("float" , config .size );
164+ this .y = requestArray ("float" , config .size );
165+ this .t1 = requestArray ("float" , 1 );
166+ this .t2 = requestArray ("float" , 1 );
167+
168+ // Build the kernels
169+ Value buildKernel = context .eval ("grcuda" , "buildkernel" );
170+
171+ this .precondition_kernel = buildKernel .execute (PRECONDITION_KERNEL , "precondition" , "pointer, sint32, sint32, sint32" );
172+ this .mmul_kernel = buildKernel .execute (MMUL_KERNEL , "matrix_vector_mult" , "const pointer, const pointer, const pointer, sint32, sint32, sint32" );
173+ this .mmul_axpy_kernel = buildKernel .execute (MMUL_KERNEL , "matrix_vector_mult_axpy" , "const pointer, const pointer, pointer, float, const pointer, sint32, sint32, sint32" );
174+ this .l2_norm_kernel = buildKernel .execute (DP_KERNEL , "l2_norm" , "const pointer, pointer, sint32" );
175+ this .dp_kernel = buildKernel .execute (DP_KERNEL , "dot" , "const pointer, pointer, pointer, sint32" );
176+ this .saxpy_kernel = buildKernel .execute (SAXPY_KERNEL , "saxpy" , "pointer, const pointer, const pointer, float, sint32" );
177+ this .copy_kernel = buildKernel .execute (SAXPY_KERNEL , "cpy" , "pointer, pointer, sint32" );
178+ this .initialize_random_symmetric_matrix = context .eval ("js" , "(X, S, N) => { \n " +
179+ " for (let i = 0; i < N; i++) {\n " +
180+ " s = (i / S) >> 0;\n " +
181+ " k = i % S;\n " +
182+ " Xs = X[s];\n " +
183+ " i_N = k * N;\n " +
184+ " for (let j = i; j < N; j++) {\n " +
185+ " val = 2 * Math.random() - 1; \n " +
186+ " Xs[i_N + j] = val;\n " +
187+ " X[(j / S) >> 0][(j % S) * N + i] = val;\n " +
188+ " }\n " +
189+ " }}" );
190+ }
191+
192+ @ Override
193+ public void initializeTest (int iteration ) {
194+ this .initialize_random_symmetric_matrix .execute (this .A , this .S , config .size );
195+ }
196+
197+ @ Override
198+ public void resetIteration (int iteration ) {
199+ // Reset result
200+ for (int i = 0 ; i < config .size ; i ++)
201+ this .x .setArrayElement (i , 1.0 / config .size );
202+ this .t1 .setArrayElement (0 , 0.0 );
203+ this .t2 .setArrayElement (0 , 0.0 );
204+ }
205+
206+ @ Override
207+ public void runTest (int iteration ) {
208+ long start_comp = System .nanoTime ();
209+ long end ;
210+
211+ // Initialization phase
212+ // precondition: A += I * np.eps;
213+ for (int i = 0 ; i < this .P ; i ++) {
214+ this .precondition_kernel .execute (config .numBlocks , config .blockSize1D ).
215+ execute (this .A [i ], config .size , Math .min (this .S , config .size - i * this .S ), i * this .S );
216+ }
217+
218+ // r = b - A * x
219+ for (int i = 0 ; i < this .P ; i ++) {
220+ this .mmul_axpy_kernel .execute (config .numBlocks , config .blockSize1D ).
221+ execute (this .A [i ], this .x , this .b , -1 , this .r , this .S , config .size , i * this .S );
222+ }
223+
224+ // p = r
225+ this .copy_kernel .execute (config .numBlocks , config .blockSize1D ).
226+ execute (this .p , this .r , config .size );
227+
228+ // t1 = r^t * r
229+ this .l2_norm_kernel .execute (config .numBlocks , config .blockSize1D ).
230+ execute (this .r , this .t1 , config .size );
231+
232+ for (int curr_iter = 0 ; curr_iter < this .ITER ; curr_iter ++) {
233+ // t2 = p^t * A * p
234+ for (int i = 0 ; i < this .P ; i ++) {
235+ this .mmul_kernel .execute (config .numBlocks , config .blockSize1D ).
236+ execute (this .A [i ], this .p , this .y , this .S , config .size , i * this .S );
237+ }
238+ this .dp_kernel .execute (config .numBlocks , config .blockSize1D ).
239+ execute (this .p , this .y , this .t2 , config .size );
240+
241+ float alpha = this .t1 .getArrayElement (0 ).asFloat () / this .t2 .getArrayElement (0 ).asFloat ();
242+ float old_r_norm_squared = this .t1 .getArrayElement (0 ).asFloat ();
243+ this .t1 .setArrayElement (0 , 0 );
244+ this .t2 .setArrayElement (0 , 0 );
245+
246+ // Update x: x = x + alpha * p
247+ this .saxpy_kernel .execute (config .numBlocks , config .blockSize1D ).
248+ execute (this .x , this .x , this .p , alpha , config .size );
249+
250+ // r = r - alpha * y
251+ this .saxpy_kernel .execute (config .numBlocks , config .blockSize1D ).
252+ execute (this .r , this .r , this .y , -1 * alpha , config .size );
253+
254+ // t1 = r^t * r
255+ this .l2_norm_kernel .execute (config .numBlocks , config .blockSize1D ).
256+ execute (this .r , this .t1 , config .size );
257+
258+ float beta = this .t1 .getArrayElement (0 ).asFloat () / old_r_norm_squared ;
259+
260+ this .saxpy_kernel .execute (config .numBlocks , config .blockSize1D ).
261+ execute (this .p , this .r , this .p , beta , config .size );
262+ }
263+
264+ // Add final sync step
265+ float tmp = x .getArrayElement (0 ).asFloat ();
266+ end = System .nanoTime ();
267+
268+ benchmarkResults .setCurrentComputationSec ((end - start_comp ) / 1000000000F );
269+
270+ // Compute GPU result
271+ for (int i = 0 ; i < this .P ; i ++) {
272+ this .mmul_axpy_kernel .execute (config .numBlocks , config .blockSize1D ).
273+ execute (this .A [i ], this .x , this .b , -1 , this .y , Math .min (this .S , config .size - i * this .S ), config .size , i * this .S );
274+ }
275+
276+ float sum = 0 ;
277+ for (int i = 0 ; i < 10 ; i ++)
278+ sum += this .y .getArrayElement (i ).asFloat ();
279+
280+ benchmarkResults .setCurrentGpuResult (0 );
281+ }
282+
283+ @ Override
284+ public void cpuValidation () {
285+ float [][] A_cpu = new float [config .size ][config .size ];
286+ float [] b_cpu = new float [config .size ];
287+ float [] x_cpu_1 = new float [config .size ];
288+ float [] x_cpu = new float [config .size ];
289+ float [] r_cpu = new float [config .size ];
290+ float [] p_cpu = new float [config .size ];
291+ float [] y_cpu = new float [config .size ];
292+ float [] tmp ;
293+ float t1_cpu = 0 ;
294+ float t2_cpu = 0 ;
295+ float alpha_cpu ;
296+ float beta_cpu ;
297+ float t1_old_cpu ;
298+
299+ for (int i = 0 ; i < config .size ; i ++) x_cpu_1 [i ] = 0 ;
300+
301+ int p_counter ;
302+ for (int i = 0 ; i < config .size ; i ++) {
303+ p_counter = Math .floorDiv (i , this .S );
304+ for (int j = 0 ; j < config .size ; j ++)
305+ A_cpu [i ][j ] = this .A [p_counter ].getArrayElement ((i % this .S ) * config .size + j ).asFloat ();
306+ }
307+
308+ // System.out.println("Matrix test A-CPU");
309+ // System.out.println("Matrix A-CPU -> rowSize: " + A_cpu.length + "; colSize: " + A_cpu[0].length);
310+ // for (int r=0; r<config.size; r++) {
311+ // System.out.print('|');
312+ // for (int c=0; c<config.size; c++) {
313+ // System.out.print(A_cpu[r][c] + "\t| ");
314+ // }
315+ // System.out.print('\n');
316+ // }
317+
318+ Random rd = new Random ();
319+ for (int i = 0 ; i < config .size ; i ++) b_cpu [i ] = rd .nextFloat ();
320+
321+ for (int i = 0 ; i < config .size ; i ++) x_cpu [i ] = 1 ;
322+
323+ tmp = matrixMult (A_cpu , x_cpu );
324+ for (int i = 0 ; i < config .size ; i ++) r_cpu [i ] = b_cpu [i ] - tmp [i ];
325+
326+ for (int i = 0 ; i < config .size ; i ++) p_cpu [i ] = r_cpu [i ];
327+
328+ for (int i = 0 ; i < config .size ; i ++) t1_cpu += (r_cpu [i ] * r_cpu [i ]);
329+
330+ // Main iteration
331+ for (int i = 0 ; i < ITER ; i ++) {
332+ y_cpu = matrixMult (A_cpu , p_cpu );
333+
334+ for (int j = 0 ; j < config .size ; j ++) t2_cpu += (p_cpu [j ] * y_cpu [j ]);
335+
336+ alpha_cpu = t1_cpu / t2_cpu ;
337+ t1_old_cpu = t1_cpu ;
338+ for (int j = 0 ; j < config .size ; j ++){
339+ x_cpu [j ] += alpha_cpu * p_cpu [j ];
340+ r_cpu [j ] -= alpha_cpu * y_cpu [j ];
341+ }
342+
343+ for (int j = 0 ; j < config .size ; j ++) t1_cpu += (r_cpu [j ] * r_cpu [j ]);
344+
345+ beta_cpu = t1_cpu / t1_old_cpu ;
346+
347+ for (int j = 0 ; j < config .size ; j ++) p_cpu [j ] = r_cpu [j ] + beta_cpu * p_cpu [j ];
348+ }
349+
350+ // System.out.println(" CPU - y pre sum ");
351+ // for (int i=0; i < config.size; i++) {System.out.print(y_cpu[i] + " % ");}
352+ // System.out.print("\n");
353+ float sum = 0 ;
354+ for (int i = 0 ; i < 10 ; i ++) {
355+ sum += y_cpu [i ];
356+ }
357+
358+ benchmarkResults .setCurrentCpuResult (sum );
359+ assertEquals (benchmarkResults .currentCpuResult (), benchmarkResults .currentGpuResult (), 1e-3 );
360+ }
361+
362+ private float [] matrixMult (float [][] a , float [] b ) {
363+ float [] res = new float [a .length ];
364+ float tempSum ;
365+
366+ for (int r = 0 ; r < a .length ; r ++) {
367+ tempSum = 0 ;
368+ for (int k = 0 ; k < b .length ; k ++) {
369+ tempSum += a [r ][k ] * b [k ];
370+ }
371+ res [r ] = tempSum ;
372+ }
373+ return res ;
374+ }
375+ }
0 commit comments