-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparallelReductionNaive.cu
More file actions
71 lines (54 loc) · 1.94 KB
/
parallelReductionNaive.cu
File metadata and controls
71 lines (54 loc) · 1.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <iostream>
#include <vector>
#include <cuda_runtime.h>
#include "helpers.h"
/**
 * Per-block sum reduction.
 *
 * Each thread loads one element of g_input_data into shared memory (threads
 * whose global index is >= n contribute 0), the block folds those values into
 * a single sum, and thread 0 stores that sum at g_output_data[blockIdx.x].
 *
 * Launch requirements:
 *   - 1D grid and 1D blocks (only the .x components are used).
 *   - Dynamic shared memory of at least blockDim.x * sizeof(float) bytes.
 *   - g_output_data must have room for gridDim.x floats.
 *
 * @param g_input_data   input array of n floats
 * @param g_output_data  output array, one partial sum per block
 * @param n              number of valid elements in g_input_data
 */
__global__
void reduce_block(float* g_input_data, float* g_output_data, int n){
    // Shared-memory scratch holding one partial sum per thread.
    extern __shared__ float sdata[];

    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Each thread loads one element from global memory; out-of-range threads
    // pad with 0 so they do not affect the sum.
    sdata[tid] = (i < (unsigned int)n) ? g_input_data[i] : 0.0f;
    __syncthreads();

    // Fold the upper part of the active range onto the lower part until one
    // value is left. Rounding the half up keeps the middle element when the
    // active count is odd, so any block size works; for power-of-two block
    // sizes this is the usual halving sequence.
    for (unsigned int active = blockDim.x; active > 1; ) {
        unsigned int half = (active + 1) / 2;
        if (tid < active - half) {
            sdata[tid] += sdata[tid + half];
        }
        // Barrier is outside the branch so every thread in the block reaches it.
        __syncthreads();
        active = half;
    }

    // Thread 0 publishes this block's partial sum, one slot per block.
    if (tid == 0) {
        g_output_data[blockIdx.x] = sdata[0];
    }
}
/**
 * Driver: fills an array with 0..n-1, reduces it on the device to one partial
 * sum per block with reduce_block, adds the partial sums on the host, and
 * prints the result next to the closed-form expected value.
 */
int main(){
    const int n = 1024 * 1024;
    const int blockSize = 256;
    const int numBlocks = (n + blockSize - 1) / blockSize;  // ceil-div

    // Host input: h_idata[i] = i.
    std::vector<float> h_idata(n);
    for (int i = 0; i < n; ++i) {
        h_idata[i] = (float)i;
    }
    std::vector<float> h_odata(numBlocks);

    float *d_idata = nullptr;
    float *d_odata = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_idata, n * sizeof(float)));
    // The kernel writes exactly one float per block, so that is all we allocate.
    CUDA_CHECK(cudaMalloc((void**)&d_odata, numBlocks * sizeof(float)));
    CUDA_CHECK(cudaMemcpy(d_idata, h_idata.data(), n * sizeof(float), cudaMemcpyHostToDevice));

    // One float of dynamic shared memory per thread in the block.
    size_t sharedMemSize = blockSize * sizeof(float);
    reduce_block<<<numBlocks, blockSize, sharedMemSize>>>(d_idata, d_odata, n);
    // A kernel launch returns no status; launch-configuration errors are
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_odata.data(), d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    // Accumulate the per-block partial sums in double so the final host-side
    // pass does not add further float rounding error.
    double totalSum = 0.0;
    for (int i = 0; i < numBlocks; ++i) {
        totalSum += h_odata[i];
    }

    // Sum of 0 to n-1, computed in double to avoid int overflow and float rounding.
    double expectedSum = (double)(n - 1) * n / 2.0;
    std::cout << "Sum: " << totalSum << std::endl;
    std::cout << "Expected Sum: " << expectedSum << std::endl;

    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_odata));
    return 0;
}