-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmatrix_transpose.cu
More file actions
109 lines (82 loc) · 2.77 KB
/
matrix_transpose.cu
File metadata and controls
109 lines (82 loc) · 2.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#include<stdio.h>
#include<stdlib.h>
#define N 2048
#define BLOCK_SIZE 32
__global__ void matrix_transpose_naive(int *input, int *output) {
int indexX = threadIdx.x + blockIdx.x * blockDim.x;
int indexY = threadIdx.y + blockIdx.y * blockDim.y;
int index = indexY * N + indexX;
int transposedIndex = indexX * N + indexY;
// this has discoalesced global memory store
output[transposedIndex] = input[index];
// this has discoalesced global memore load
// output[index] = input[transposedIndex];
}
__global__ void matrix_transpose_shared(int *input, int *output) {
__shared__ int sharedMemory [BLOCK_SIZE] [BLOCK_SIZE];
// global index
int indexX = threadIdx.x + blockIdx.x * blockDim.x;
int indexY = threadIdx.y + blockIdx.y * blockDim.y;
// transposed global memory index
int tindexX = threadIdx.x + blockIdx.y * blockDim.x;
int tindexY = threadIdx.y + blockIdx.x * blockDim.y;
// local index
int localIndexX = threadIdx.x;
int localIndexY = threadIdx.y;
int index = indexY * N + indexX;
int transposedIndex = tindexY * N + tindexX;
// reading from global memory in coalesed manner and performing tanspose in shared memory
sharedMemory[localIndexX][localIndexY] = input[index];
__syncthreads();
// writing into global memory in coalesed fashion via transposed data in shared memory
output[transposedIndex] = sharedMemory[localIndexY][localIndexX];
}
//basically just fills the array with index.
void fill_array(int *data) {
for(int idx=0;idx<(N*N);idx++)
data[idx] = idx;
}
void print_output(int *a, int *b) {
printf("\n Original Matrix::\n");
for(int idx=0;idx<(N*N);idx++) {
if(idx%N == 0)
printf("\n");
printf(" %d ", a[idx]);
}
printf("\n Transposed Matrix::\n");
for(int idx=0;idx<(N*N);idx++) {
if(idx%N == 0)
printf("\n");
printf(" %d ", b[idx]);
}
}
int main(void) {
int *a, *b;
int *d_a, *d_b; // device copies of a, b, c
int size = N * N *sizeof(int);
// Alloc space for host copies of a, b, c and setup input values
a = (int *)malloc(size); fill_array(a);
b = (int *)malloc(size);
// Alloc space for device copies of a, b, c
cudaMalloc((void **)&d_a, size);
cudaMalloc((void **)&d_b, size);
// Copy inputs to device
cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
dim3 blockSize(BLOCK_SIZE,BLOCK_SIZE,1);
dim3 gridSize(N/BLOCK_SIZE,N/BLOCK_SIZE,1);
matrix_transpose_naive<<<gridSize,blockSize>>>(d_a,d_b);
// Copy result back to host
// cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost);
// print_output(a,b);
matrix_transpose_shared<<<gridSize,blockSize>>>(d_a,d_b);
// Copy result back to host
cudaMemcpy(b, d_b, size, cudaMemcpyDeviceToHost);
// print_output(a,b);
// terminate memories
free(a);
free(b);
cudaFree(d_a);
cudaFree(d_b);
return 0;
}