-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhistogramAtomicAdd.cu
More file actions
96 lines (78 loc) · 3.08 KB
/
histogramAtomicAdd.cu
File metadata and controls
96 lines (78 loc) · 3.08 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#include <cuda_runtime.h>
#include <stdio.h>
#include "helpers.h"
__global__
void histogram_global_atomic(unsigned int* input, unsigned int* histogram, int n, int num_bins){
// kernel using global atomics
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// each thread will process one element
if (tid < n){
unsigned int item = input[tid];
// atomic addition to appropriate bin
// ensures the addition to histogram happens in one uninterrupted sequence
atomicAdd(&histogram[item], 1);
}
}
__global__
void histogram_incorrect(unsigned int* input, unsigned int* histogram, int n, int num_bins){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
if (tid < n){
unsigned int item = input[tid];
// race condition
// multiple threads try to do this at the same time. final result depends on order in which the threads execute
histogram[item]++;
}
}
int main(){
const int N = 1000000;
const int NUM_BINS = 256;
const int BLOCK_SIZE = 256;
unsigned int* h_input = new unsigned int[N];
unsigned int* h_histogram = new unsigned int[NUM_BINS]();
unsigned int* h_hist_correct = new unsigned int[NUM_BINS]();
// initializing with random values. ensuring no value exceeds number of bins
for(int i = 0; i < N; i++) {
h_input[i] = rand() % NUM_BINS;
}
// calucating on cpu for verification
for(int i = 0; i < N; i++) {
h_hist_correct[h_input[i]]++;
}
unsigned int *d_input, *d_histogram;
CUDA_CHECK(cudaMalloc(&d_input, sizeof(unsigned int) * N));
CUDA_CHECK(cudaMalloc(&d_histogram, sizeof(unsigned int) * NUM_BINS));
CUDA_CHECK(cudaMemcpy(d_input, h_input, N * sizeof(unsigned int), cudaMemcpyHostToDevice));
// reset device histogram
CUDA_CHECK(cudaMemset(d_histogram, 0, NUM_BINS * sizeof(unsigned int)));
cudaEvent_t start, stop;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&stop));
CUDA_CHECK(cudaEventRecord(start));
histogram_global_atomic<<<(N + BLOCK_SIZE - 1)/BLOCK_SIZE, BLOCK_SIZE>>>(d_input, d_histogram, N, NUM_BINS);
CUDA_CHECK(cudaEventRecord(stop));
CUDA_CHECK(cudaEventSynchronize(stop));
float milliseconds = 0;
CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));
printf("Atomic Kernel Time: %f ms\n", milliseconds);
CUDA_CHECK(cudaMemcpy(h_histogram, d_histogram, NUM_BINS * sizeof(unsigned int), cudaMemcpyDeviceToHost));
bool correct = true;
for(int i = 0; i < NUM_BINS; i++) {
if(h_histogram[i] != h_hist_correct[i]) {
printf("Mismatch at bin %d: %u != %u\n",
i, h_histogram[i], h_hist_correct[i]);
correct = false;
break;
}
}
printf("Results are %s\n", correct ? "correct" : "incorrect");
printf("\nFirst 10 bins:\n");
for(int i = 0; i < 10; i++) {
printf("Bin %d: %u\n", i, h_histogram[i]);
}
CUDA_CHECK(cudaFree(d_input));
CUDA_CHECK(cudaFree(d_histogram));
delete[] h_input;
delete[] h_histogram;
delete[] h_hist_correct;
return 0;
}