-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper.h
110 lines (95 loc) · 3.06 KB
/
helper.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#include <cublas_v2.h> // cuBLAS handle + cublasSgemm

#include <algorithm>   // std::max
#include <cmath>       // std::fabs
#include <cstdio>      // printf / fprintf (previously only available transitively)
#include <cstdlib>     // header for rand(), exit(), EXIT_FAILURE
#include <ctime>       // header for time-seed for rand()
#include <iostream>    // std::cout in compareResults (previously only transitive)
#include <limits>      // get the smallest increment for a datatype
using namespace std;
// Integer ceiling division: number of N-sized chunks needed to cover M.
// Fully parenthesized so the expansion is safe inside larger expressions:
// the previous form ((M) + (N)-1) / (N) mis-associated in contexts such as
// `x / CEIL_DIV(a, b)` or `x % CEIL_DIV(a, b)`.
#define CEIL_DIV(M, N) (((M) + (N) - 1) / (N))
// Wraps a CUDA runtime call and aborts with file/line context on failure.
// do/while(0) makes the macro expand to a single statement, so it composes
// safely with if/else (the previous bare-brace form turned
// `if (c) gpuErrchk(x); else ...` into a syntax error).
#define gpuErrchk(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)
// Reports a CUDA runtime error (if any) to stderr.
//   code  : status returned by a CUDA runtime API call
//   file  : call-site source file (__FILE__)
//   line  : call-site line number (__LINE__)
//   abort : when true (default), terminate the process with the error code
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}
// Fills the first `size` entries of A with their own index (0, 1, 2, ...).
// Alternative fills (uniform random, all-ones) are kept below for debugging.
void intializeMatrix(float *A, int size){
    for (int idx = 0; idx < size; ++idx) {
        // A[idx] = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
        // A[idx] = 1.0f;
        A[idx] = static_cast<float>(idx);
    }
}
// Fills the first `size` entries of A with the constant `val`.
void intializeMatrix(float *A, int size, float val){
    int idx = 0;
    while (idx < size) {
        A[idx] = val;
        ++idx;
    }
}
// Allocates `len` bytes of device memory.
// Returns the device pointer on success, nullptr on failure — unlike
// gpuErrchk-wrapped calls this never aborts, so callers MUST check the
// result before use (passing nullptr to a kernel fails silently later).
void *fixed_cudaMalloc(size_t len)
{
    void *p = nullptr;
    if (cudaMalloc(&p, len) == cudaSuccess) return p;
    return nullptr;
}
// Relative floating-point comparison (Knuth-style): true when |a - b| is
// within `epsilon` scaled by the larger magnitude of the two operands.
// Uses std::fabs so the correct overload is chosen for T (the previous
// unqualified fabs relied on the C double-only version being visible).
// Note: two exact zeros compare equal, but zero vs. a tiny nonzero value
// passes only if the nonzero magnitude is itself within scale * epsilon.
template<class T>
bool approximatelyEqual(T a, T b, T epsilon)
{
    const T scale = std::max(std::fabs(a), std::fabs(b));
    return std::fabs(a - b) <= scale * epsilon;
}
// Element-wise comparison of two row-major M x N result matrices.
// Tolerance is T's machine epsilon scaled by `factor`, to absorb the
// accumulation error expected when comparing GPU output against a
// reference GEMM. Terminates the process on the first mismatch.
template<class T>
void compareResults(T *C1, T *C2, int M, int N){
    const int factor = 100;
    T epsilon = std::numeric_limits<T>::epsilon() * factor;
    std::cout<<"Epsilon : " << epsilon << std::endl;
    for(int i = 0; i < M; i++){
        for(int j = 0; j < N; j++){
            if( !approximatelyEqual(C1[i*N + j], C2[i*N + j], epsilon) ){
                // Cast to double: %f expects double in a varargs call, and T
                // is not guaranteed to promote correctly for every type.
                printf("Outside tolerance at indices at i : %d, j : %d, C1 : %f, C2 : %f\n",
                       i, j, (double)C1[i*N + j], (double)C2[i*N + j]);
                // exit(0) previously reported SUCCESS to the shell even on a
                // mismatch; a mismatch is a failure, so signal it as one.
                exit(EXIT_FAILURE);
            }
        }
    }
}
// Prints a rows x cols row-major matrix, one row per line, using printf
// (works from host code and from device code, where output is for
// debugging only).
__host__ __device__
void printArrayDevice(float *A, int rows, int cols){
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            printf("%1.0f ", A[r * cols + c]);
        }
        printf("\n");
    }
}
// Host-side reference matrix multiply: hC = hA * hB, all row-major.
// hA is M x K, hB is K x N, hC is M x N. Each dot product is accumulated
// in double to give a tighter baseline when validating GPU results.
void computeMM(float *hA, float *hB, float *hC, int M, int N, int K){
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            double acc = 0.0;
            for (int k = 0; k < K; ++k) {
                acc += static_cast<double>(hA[row * K + k]) *
                       static_cast<double>(hB[k * N + col]);
            }
            hC[row * N + col] = static_cast<float>(acc);
        }
    }
}
// Computes C_cuBLAS = alpha * A * B + beta * C_cuBLAS for row-major
// A (M x K), B (K x N), C_cuBLAS (M x N) via cuBLAS.
// cuBLAS expects column-major storage, so instead of computing (A * B)'
// we compute B' * A', which leaves the row-major product directly in
// C_cuBLAS. Aborts on any cuBLAS failure — the statuses returned by
// cublasCreate/cublasSgemm were previously ignored, so errors surfaced
// only as silently wrong results.
void cuBLAScomputeMM(float *A, float *B, float *C_cuBLAS, int M, int N, int K, float alpha, float beta){
    cublasHandle_t handle;
    cublasStatus_t status = cublasCreate(&handle);
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasCreate failed: status %d\n", (int)status);
        exit(EXIT_FAILURE);
    }
    status = cublasSgemm(handle
            , CUBLAS_OP_N  // no transpose for B (row-major data read as B')
            , CUBLAS_OP_N  // no transpose for A (row-major data read as A')
            , N            // rows of B' / rows of C'
            , M            // cols of A' / cols of C'
            , K            // shared inner dimension
            , &alpha
            , B            // note first input is B (see header comment)
            , N
            , A            // note second input is A
            , K
            , &beta
            , C_cuBLAS
            , N
            );
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "cublasSgemm failed: status %d\n", (int)status);
        exit(EXIT_FAILURE);
    }
    cublasDestroy(handle);
}