-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.cu
33 lines (25 loc) · 944 Bytes
/
kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#include <cuda.h>
#include <stdio.h>
#include "kernel.h"
extern "C" {
/**
 * Element-wise matrix addition: C[r][c] = A[r][c] + B[r][c] for every element of
 * the 10000 x 10000 matrices named by the parameter types.
 *
 * Launch layout: any 1D or 2D grid/block configuration is valid. Each thread walks
 * the matrix with a stride equal to the total number of threads launched in that
 * dimension, so the result does not depend on the grid exactly covering the data.
 * No shared memory is used.
 *
 * @param A first input matrix  (read only)
 * @param B second input matrix (read only)
 * @param C output matrix       (every element is overwritten)
 *
 * The pointers are dereferenced in device code, so they must refer to memory that
 * is accessible from the device.
 */
__global__ void addKernel(float A[10000][10000], float B[10000][10000], float C[10000][10000])
{
    // Extent of both dimensions, kept in sync with the declared parameter types.
    // (A guard of 1000 would leave everything outside the top-left corner untouched.)
    const size_t N = 10000;

    // Total threads per dimension; computed in size_t so that large grids cannot
    // overflow 32-bit arithmetic.
    const size_t colStride = (size_t)gridDim.x * blockDim.x;
    const size_t rowStride = (size_t)gridDim.y * blockDim.y;

    const size_t firstCol = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const size_t firstRow = (size_t)blockIdx.y * blockDim.y + threadIdx.y;

    // x selects the column (the contiguous dimension of a row-major array), so
    // neighbouring threads in x touch neighbouring floats of the same row.
    for (size_t row = firstRow; row < N; row += rowStride) {
        for (size_t col = firstCol; col < N; col += colStride) {
            C[row][col] = A[row][col] + B[row][col];
        }
    }
}
/**
 * Host-side entry point: launches addKernel on A, B and C with the caller's launch
 * configuration and waits for the kernel to finish.
 *
 * The three pointers are forwarded to the kernel unchanged. Nothing here allocates
 * or copies memory, so they must already be usable from device code.
 * NOTE(review): confirm that callers pass device or managed allocations rather
 * than ordinary host arrays.
 *
 * @param A         first input matrix
 * @param B         second input matrix
 * @param C         output matrix
 * @param numBlocks number of blocks in the one-dimensional grid
 * @param threads   number of threads in each one-dimensional block
 *
 * The function returns void, so launch and execution failures are reported on
 * stderr instead of being dropped silently.
 */
void addWithCuda(float A[10000][10000], float B[10000][10000], float C[10000][10000], unsigned int numBlocks, unsigned int threads)
{
    // Launch with exactly the configuration the caller asked for.
    addKernel<<<numBlocks, threads>>>(A, B, C);

    // A kernel launch returns no status; an invalid configuration (for example zero
    // blocks or too many threads per block) only shows up through cudaGetLastError.
    cudaError_t status = cudaGetLastError();
    if (status != cudaSuccess) {
        fprintf(stderr, "addWithCuda: kernel launch failed: %s\n", cudaGetErrorString(status));
        return;
    }

    // Kernels run asynchronously: wait here so that C is complete when we return and
    // any fault raised while the kernel was running is reported.
    status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        fprintf(stderr, "addWithCuda: kernel execution failed: %s\n", cudaGetErrorString(status));
    }
}
}
// addWithCuda (above) is the host-side helper that uses CUDA to add two matrices element-wise in parallel.