-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathimageBlur.cu
More file actions
94 lines (75 loc) · 3.36 KB
/
imageBlur.cu
File metadata and controls
94 lines (75 loc) · 3.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#include <stdio.h>
#include <stdlib.h>
#include <new>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using namespace std;
// error checking
// Checks the status of a CUDA runtime call. On any value other than
// cudaSuccess it prints the numeric code, the runtime's description and the
// given source location to stderr, then terminates the process with
// EXIT_FAILURE. On success the status is returned unchanged.
//
// error - status returned by the CUDA runtime call being checked
// file  - source file name to report
// line  - source line number to report
inline cudaError_t checkCudaErrorsAt(cudaError_t error, const char* file, int line){
    if(error != cudaSuccess){
        fprintf(stderr, "CUDA ERROR = %d: %s (%s:%d)\n", (int)error, cudaGetErrorString(error), file, line);
        exit(EXIT_FAILURE);
    }
    return error;
}

// Plain-function form, kept so the name still denotes a real function (for
// example when it is referenced without a call). __FILE__/__LINE__ expand
// here, so this form can only report the helper's own location.
inline cudaError_t checkCudaErrors(cudaError_t error){
    return checkCudaErrorsAt(error, __FILE__, __LINE__);
}

// Call-site form: every checkCudaErrors(...) call written after this point
// expands here, so __FILE__/__LINE__ name the line of the failing call rather
// than a line inside the helper.
#define checkCudaErrors(error) checkCudaErrorsAt((error), __FILE__, __LINE__)
// Box (mean) blur of a single-channel 8-bit image stored row-major.
//
// Launch layout: one thread per output pixel, x -> column, y -> row. Threads
// that fall outside the image return immediately, so the grid only needs to
// cover width x height. No shared memory is used.
//
// inputImage  - device-accessible pointer to width*height source pixels
// outputImage - device-accessible pointer to width*height destination pixels
// width       - image width in pixels
// height      - image height in pixels
// kernelSize  - side length of the averaging window. The window applied is
//               2*(kernelSize/2)+1 pixels per side (i.e. kernelSize itself for
//               odd sizes); values below 1 are treated as 1 (plain copy).
//
// Neighbours outside the image are replaced by the nearest edge pixel
// (replication). The result is the truncated integer mean of the window.
__global__
void imageBlurKernel(unsigned char* inputImage, unsigned char* outputImage, int width, int height, int kernelSize){
    // position of this thread's pixel in the output image
    const int col = blockDim.x * blockIdx.x + threadIdx.x;
    const int row = blockDim.y * blockIdx.y + threadIdx.y;

    // the grid is rounded up to whole blocks, so some threads have no pixel
    if (row >= height || col >= width){
        return;
    }

    // clamp so that a non-positive kernelSize degrades to a 1x1 window
    const int kernelRadius = max(kernelSize, 1) / 2;

    // number of samples the loops below actually visit: (2r+1) x (2r+1).
    // Dividing by this (not by radius^2) is what makes the result a mean.
    const int windowSide = 2 * kernelRadius + 1;
    const unsigned int tapCount = (unsigned int)windowSide * (unsigned int)windowSide;

    // integer accumulation is exact; 255 * tapCount fits in 32 bits for any
    // window up to roughly 4000 pixels per side
    unsigned int pixelSum = 0;
    for (int kernelRow = -kernelRadius; kernelRow <= kernelRadius; ++kernelRow){
        // replication: rows past the border reuse the first/last row
        const int clampedRow = max(0, min(row + kernelRow, height - 1));
        const size_t rowOffset = (size_t)clampedRow * width;
        for (int kernelCol = -kernelRadius; kernelCol <= kernelRadius; ++kernelCol){
            // replication: columns past the border reuse the first/last column
            const int clampedCol = max(0, min(col + kernelCol, width - 1));
            pixelSum += inputImage[rowOffset + clampedCol];
        }
    }

    // the mean of 8-bit samples is itself within 0..255, so the cast is safe
    outputImage[(size_t)row * width + col] = (unsigned char)(pixelSum / tapCount);
}
// Demo driver: fills a 256x256 8-bit image with a repeating 0..255 ramp,
// blurs it on the GPU with a 3x3 window and prints the first few output
// pixels. Returns 0 on success; any host allocation or CUDA failure prints a
// message to stderr and terminates the process with EXIT_FAILURE.
int main(){
    const int width = 256;
    const int height = 256;
    const int kernelSize = 3;   // side length of the blur window
    const size_t pixelCount = (size_t)width * height;
    const size_t imageBytes = pixelCount * sizeof(unsigned char);

    // plain new[] throws instead of returning null, which would make the
    // check below unreachable; the nothrow form reports failure as null
    unsigned char* h_inputImage = new (std::nothrow) unsigned char[pixelCount];
    unsigned char* h_outputImage = new (std::nothrow) unsigned char[pixelCount];
    if(!h_inputImage || !h_outputImage){
        fprintf(stderr, "host memory allocation failed \n");
        delete[] h_inputImage;   // delete[] on a null pointer is a no-op
        delete[] h_outputImage;
        exit(EXIT_FAILURE);
    }

    // test pattern: pixel value is its flat index modulo 256
    for(size_t i = 0; i < pixelCount; ++i){
        h_inputImage[i] = (unsigned char)(i % 256);
    }

    unsigned char* d_inputImage = NULL;
    unsigned char* d_outputImage = NULL;
    checkCudaErrors(cudaMalloc((void**)&d_inputImage, imageBytes));
    checkCudaErrors(cudaMalloc((void**)&d_outputImage, imageBytes));
    checkCudaErrors(cudaMemcpy(d_inputImage, h_inputImage, imageBytes, cudaMemcpyHostToDevice));

    // one thread per pixel; the grid is rounded up so partial tiles at the
    // right/bottom edges are still covered (the kernel bounds-checks)
    dim3 threadsPerBlock(16, 16);
    dim3 blocksPerGrid((width + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (height + threadsPerBlock.y - 1) / threadsPerBlock.y);
    imageBlurKernel<<<blocksPerGrid, threadsPerBlock>>>(d_inputImage, d_outputImage, width, height, kernelSize);

    // launch-configuration errors are only visible through cudaGetLastError
    checkCudaErrors(cudaGetLastError());
    // kernel launches are asynchronous: synchronizing here waits for the
    // kernel to finish and surfaces execution errors before the copy back
    checkCudaErrors(cudaDeviceSynchronize());

    checkCudaErrors(cudaMemcpy(h_outputImage, d_outputImage, imageBytes, cudaMemcpyDeviceToHost));

    printf("first few blurred pixel values are \n");
    const size_t previewCount = pixelCount < 10 ? pixelCount : 10;
    for(size_t i = 0; i < previewCount; ++i){
        printf("%d ", h_outputImage[i]);
    }
    printf("\n");

    // freeing memory
    checkCudaErrors(cudaFree(d_inputImage));
    checkCudaErrors(cudaFree(d_outputImage));
    delete[] h_inputImage;
    delete[] h_outputImage;
    return 0;
}