gpu/cuda/parallelReductionOptimized.cu at main · asrjy/gpu · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <iostream>
#include <cuda_runtime.h>
#include "helpers.h"

__global__
void reduce_warp(float *g_idata, float *g_odata, int n) {
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // warp id and lane id
    // WARP_SIZE is 32, standard in nvidia GPUs
    int laneId = tid % WARP_SIZE;
    int warpId = tid / WARP_SIZE;

    float val = (i < n) ? g_idata[i] : 0.0f; // Load data

   // warp level reduction
    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        // 0xffffffff is the mask indicated all threads in the warp should participate in the shuffle
        // each bit in the mask corrresponds to the shuffle, we are setting it to 1 indicating it's active
        // val is the value to be shuffled (partial sum of each thread)
        // source lan = laneId + offset. each thread recieves the value of val from thread whose laneId is laneId + offset
        float other = __shfl_sync(0xffffffff, val, laneId + offset);
        val += other;
    }

    // one thread per warp. writing warp sum to shared memory
    extern __shared__ float warpSums[];
    if (laneId == 0) {
        warpSums[warpId] = val;
    }
    __syncthreads();

    // normal block level reduction using shared memory
    if (warpId == 0) {
        if (tid < blockDim.x/WARP_SIZE){
            int sdataIndex = tid;

            float sdata[blockDim.x/WARP_SIZE];
             sdata[sdataIndex] = warpSums[sdataIndex];
              __syncthreads();
            for (int s = (blockDim.x/WARP_SIZE) / 2; s > 0; s >>= 1) {

                if (sdataIndex < s) {
                     sdata[sdataIndex] += sdata[sdataIndex + s];

                }
                 __syncthreads();
            }
             if (tid == 0){
                g_odata[blockIdx.x] = sdata[0];
            }
        }
    }
}


int main(){
    int n = 1024 * 1024;
    int blockSize = 256;
    int numBlocks = (n + blockSize - 1)/blockSize;
    // number of warps per block
    int numWarpsPerBlock = blockSie / WARP_SIZE;

    std::vector<float> h_idata(n);
    for (int i = 0; i < n; ++i) {
        h_idata[i] = (float)i; // Initialize data
    }
    std::vector<float> h_odata(numBlocks);

    float *d_idata, *d_odata;
    CUDA_CHECK(cudaMalloc((void**)&d_idata, n * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&d_odata, numBlocks * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(d_idata, h_idata.data(), n * sizeof(float), cudaMemcpyHostToDevice));

    size_t sharedMemSize = numWarpsPerBlock * sizeof(float); // One float per warp
    reduce_warp<<<numBlocks, blockSize, sharedMemSize>>>(d_idata, d_odata, n);
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaMemcpy(h_odata.data(), d_odata, numBlocks * sizeof(float), cudaMemcpyDeviceToHost));

    float totalSum = 0.0f;
    for (int i = 0; i < numBlocks; ++i) {
        totalSum += h_odata[i];
    }

    float expectedSum = (float)(n - 1) * n / 2.0f;
    std::cout << "Sum: " << totalSum << std::endl;
    std::cout << "Expected Sum: " << expectedSum << std::endl;

    CUDA_CHECK(cudaFree(d_idata));
    CUDA_CHECK(cudaFree(d_odata));

    return 0;

}