Skip to content

Commit fc45b32

Browse files
committed
feat(convolution - ops): add img2col operation and related test cases
- Added the img2col operation to the convolution operations suite. - Implemented a set of test cases to verify the correctness of the img2col operation. - These test cases cover various input scenarios to ensure the stability and accuracy of the img2col implementation.
1 parent d324650 commit fc45b32

5 files changed

Lines changed: 135 additions & 13 deletions

File tree

include/NeuZephyr/OperationKernels.cuh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,11 +1000,15 @@ namespace nz::krnl {
10001000
void NgradCopy(dim3 gridDim, dim3 blockDim, float* out, float* in, size_t n,
10011001
const std::vector<size_t>& offset_o, const std::vector<size_t>& offset_i);
10021002

1003-
void Expand(dim3 gridDim, dim3 blockDim, float* out, const float* in, size_t n,
1003+
void Expand(dim3 gridDim, dim3 blockDim, float* out, float* in, size_t n,
10041004
size_t total);
10051005

1006-
void Compress(dim3 gridDim, dim3 blockDim, float* out, const float* in, size_t n,
1006+
void Compress(dim3 gridDim, dim3 blockDim, float* out, float* in, size_t n,
10071007
size_t total);
1008+
1009+
void img2col(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
1010+
const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
1011+
const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);
10081012
#endif
10091013
}
10101014

include/NeuZephyr/TensorOperations.cuh

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -798,8 +798,8 @@ namespace nz::data {
798798
for (auto j = 0; j < out.shape()[1]; j++) {
799799
offsetC.push_back(i * out.shape().getStride(0) + j * out.shape().getStride(1));
800800
offsetA.push_back(i * (lhs.shape().N() > 1 ? lhs.shape().getStride(0) : 0) + j * (lhs.shape().C() > 1
801-
? lhs.shape().getStride(1)
802-
: 0));
801+
? lhs.shape().getStride(1)
802+
: 0));
803803
offsetB.push_back(i * (rhs.shape().N() > 1 ? rhs.shape().getStride(0) : 0) + j * (
804804
rhs.shape().C() > 1 ? rhs.shape().getStride(1) : 0));
805805
}
@@ -869,8 +869,8 @@ namespace nz::data {
869869
for (auto j = 0; j < out.shape()[1]; j++) {
870870
offsetC.push_back(i * out.shape().getStride(0) + j * out.shape().getStride(1));
871871
offsetA.push_back(i * (lhs.shape().N() > 1 ? lhs.shape().getStride(0) : 0) + j * (lhs.shape().C() > 1
872-
? lhs.shape().getStride(1)
873-
: 0));
872+
? lhs.shape().getStride(1)
873+
: 0));
874874
offsetB.push_back(i * (rhs.shape().N() > 1 ? rhs.shape().getStride(0) : 0) + j * (
875875
rhs.shape().C() > 1 ? rhs.shape().getStride(1) : 0));
876876
}
@@ -939,8 +939,8 @@ namespace nz::data {
939939
for (auto j = 0; j < out.shape()[1]; j++) {
940940
offsetC.push_back(i * out.shape().getStride(0) + j * out.shape().getStride(1));
941941
offsetA.push_back(i * (lhs.shape().N() > 1 ? lhs.shape().getStride(0) : 0) + j * (lhs.shape().C() > 1
942-
? lhs.shape().getStride(1)
943-
: 0));
942+
? lhs.shape().getStride(1)
943+
: 0));
944944
offsetB.push_back(i * (rhs.shape().N() > 1 ? rhs.shape().getStride(0) : 0) + j * (
945945
rhs.shape().C() > 1 ? rhs.shape().getStride(1) : 0));
946946
}
@@ -1008,8 +1008,8 @@ namespace nz::data {
10081008
for (auto j = 0; j < out.shape()[1]; j++) {
10091009
offsetC.push_back(i * out.shape().getStride(0) + j * out.shape().getStride(1));
10101010
offsetA.push_back(i * (lhs.shape().N() > 1 ? lhs.shape().getStride(0) : 0) + j * (lhs.shape().C() > 1
1011-
? lhs.shape().getStride(1)
1012-
: 0));
1011+
? lhs.shape().getStride(1)
1012+
: 0));
10131013
offsetB.push_back(i * (rhs.shape().N() > 1 ? rhs.shape().getStride(0) : 0) + j * (
10141014
rhs.shape().C() > 1 ? rhs.shape().getStride(1) : 0));
10151015
}
@@ -1085,7 +1085,8 @@ namespace nz::data {
10851085
return result;
10861086
}
10871087

1088-
DL_API void iSoftmaxJacobian(float* out, float* in, size_t n, const std::vector<size_t>& offset_o, const std::vector<size_t>& offset_i);
1088+
DL_API void iSoftmaxJacobian(float* out, float* in, size_t n, const std::vector<size_t>& offset_o,
1089+
const std::vector<size_t>& offset_i);
10891090

10901091
template <typename T>
10911092
std::enable_if_t<is_valid_tensor_type<T>::value, T>
@@ -1103,5 +1104,21 @@ namespace nz::data {
11031104
iSoftmaxJacobian(result.data(), in.data(), n, offset_o, offset_i);
11041105
return result;
11051106
}
1107+
1108+
DL_API void iImg2col(float* out, float* in, const size_t H_out,
1109+
const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
1110+
const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);
1111+
1112+
template <typename T>
1113+
std::enable_if_t<is_valid_tensor_type<T>::value, T>
1114+
tensorImg2col(const T& in, const size_t K_h, const size_t K_w, const size_t stride,
1115+
const size_t pad) {
1116+
const size_t H_out = (in.shape().H() + 2 * pad - K_h) / stride + 1;
1117+
const size_t W_out = (in.shape().W() + 2 * pad - K_w) / stride + 1;
1118+
T result({in.shape()[0], 1, H_out * W_out, in.shape().C() * K_h * K_w});
1119+
iImg2col(result.data(), in.data(), H_out, W_out, in.shape().C(), K_h, K_w, stride, pad,
1120+
in.shape().H(), in.shape().W(), in.shape()[0]);
1121+
return result;
1122+
}
11061123
}
11071124
#endif //TENSOROPERATIONS_CUH

src/OperationKernels.cu

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1285,7 +1285,7 @@ namespace nz::krnl {
12851285
out[idx] = in[idx % n];
12861286
}
12871287

1288-
void Expand(const dim3 gridDim, const dim3 blockDim, float* out, const float* in, const size_t n,
1288+
void Expand(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t n,
12891289
const size_t total) {
12901290
StreamManager<float>::Instance().submit(ExpandKernel, gridDim, blockDim, 0, out, in, n, total);
12911291
}
@@ -1298,8 +1298,36 @@ namespace nz::krnl {
12981298
atomicAdd(out + idx % n, in[idx]);
12991299
}
13001300

1301-
void Compress(const dim3 gridDim, const dim3 blockDim, float* out, const float* in, const size_t n,
1301+
void Compress(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t n,
13021302
const size_t total) {
13031303
StreamManager<float>::Instance().submit(CompressKernel, gridDim, blockDim, 0, out, in, n, total);
13041304
}
1305+
1306+
__global__ void img2colKernel(float* out, const float* in, const size_t H_out, const size_t W_out, const size_t C,
1307+
const size_t K_h, const size_t K_w, const size_t stride, const size_t pad, const size_t H_in, const size_t W_in, const size_t batch) {
1308+
const size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
1309+
if (idx >= H_out * W_out * C * K_h * K_w * batch) {
1310+
return;
1311+
}
1312+
const size_t fixedIdx = idx % (H_out * W_out * C * K_h * K_w);
1313+
const size_t currentBatch = idx / (H_out * W_out * C * K_h * K_w);
1314+
const size_t k = fixedIdx / (C * K_h * K_w);
1315+
const size_t m = fixedIdx % (C * K_h * K_w);
1316+
const size_t c = m / (K_h * K_w);
1317+
const long long h = (k / W_out) * stride - pad + (m % (K_h * K_w)) / K_w;
1318+
const long long w = (k % W_out) * stride - pad + m % K_w;
1319+
if (h >= 0 && h < H_in && w >= 0 && w < W_in) {
1320+
out[idx] = in[currentBatch * (C * H_in * W_in) + c * (H_in * W_in) + h * W_in + w];
1321+
}
1322+
else {
1323+
out[idx] = 0;
1324+
}
1325+
}
1326+
1327+
void img2col(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
1328+
const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
1329+
const size_t pad, const size_t H_in, const size_t W_in, const size_t batch) {
1330+
StreamManager<float>::Instance().submit(img2colKernel, gridDim, blockDim, 0, out, in, H_out, W_out, C,
1331+
K_h, K_w, stride, pad, H_in, W_in, batch);
1332+
}
13051333
}

src/TensorOperations.cu

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,4 +131,12 @@ namespace nz::data {
131131
dim3 grid((n + block.x - 1) / block.x, (n + block.y - 1) / block.y);
132132
krnl::SoftmaxJacobian(grid, block, out, in, n, offset_o, offset_i);
133133
}
134+
135+
void iImg2col(float* out, float* in, const size_t H_out, const size_t W_out, const size_t C, const size_t K_h,
136+
const size_t K_w, const size_t stride, const size_t pad, const size_t H_in, const size_t W_in,
137+
const size_t batch) {
138+
const dim3 block(BLOCKSIZE);
139+
const dim3 grid((H_out * W_out * C * K_h * K_w * batch + BLOCKSIZE - 1) / BLOCKSIZE);
140+
krnl::img2col(grid, block, out, in, H_out, W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);
141+
}
134142
}

test/Test.cpp

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2876,4 +2876,69 @@ TEST(Model, SGDOptimize) {
28762876
expected.dataInject(input1Gard.begin(), input1Gard.end(), true);
28772877

28782878
EXPECT_EQ(expected, *model.input1.output);
2879+
}
2880+
2881+
TEST(TensorBasic, img2colTest) {
2882+
const size_t n = 2;
2883+
const size_t c = 3;
2884+
const size_t h = 4;
2885+
const size_t w = 5;
2886+
const size_t k_h = 3;
2887+
const size_t k_w = 3;
2888+
const size_t stride = 1;
2889+
const size_t pad = 1;
2890+
const size_t H_out = (h + 2 * pad - k_h) / stride + 1;
2891+
const size_t W_out = (w + 2 * pad - k_w) / stride + 1;
2892+
2893+
std::vector<float> inputData({n*c*h*w});
2894+
std::vector<float> expectedData({n*H_out*W_out*k_h*k_w*c});
2895+
2896+
std::random_device rd;
2897+
std::mt19937 gen(rd());
2898+
std::uniform_real_distribution<float> dist(0.1f, 0.9f);
2899+
2900+
for (auto& i : inputData) {
2901+
i = dist(gen);
2902+
}
2903+
2904+
for (size_t b = 0; b < n; ++b) {
2905+
for (size_t i = 0; i < H_out; ++i) {
2906+
for (size_t j = 0; j < W_out; ++j) {
2907+
const int h_start = static_cast<int>(i * stride) - pad;
2908+
const int w_start = static_cast<int>(j * stride) - pad;
2909+
2910+
for (size_t r = 0; r < k_h; ++r) {
2911+
const int h_in = h_start + r;
2912+
for (size_t s = 0; s < k_w; ++s) {
2913+
const int w_in = w_start + s;
2914+
for (size_t c_in = 0; c_in < c; ++c_in) {
2915+
float val = 0.0f;
2916+
if (h_in >= 0 && h_in < h && w_in >= 0 && w_in < w) {
2917+
const size_t input_idx =
2918+
b * (c * h * w) +
2919+
c_in * (h * w) +
2920+
h_in * w +
2921+
w_in;
2922+
val = inputData[input_idx];
2923+
}
2924+
const size_t expected_idx =
2925+
b * (H_out * W_out * k_h * k_w * c) +
2926+
(i * W_out + j) * (k_h * k_w * c) +
2927+
c_in * (k_h * k_w) +
2928+
r * k_w +
2929+
s;
2930+
expectedData[expected_idx] = val;
2931+
}
2932+
}
2933+
}
2934+
}
2935+
}
2936+
}
2937+
2938+
Tensor input({n, c, h, w});
2939+
input.dataInject(inputData.begin(), inputData.end());
2940+
auto result = tensorImg2col(input, k_h, k_w, stride, pad);
2941+
Tensor expected({n, 1, H_out * W_out, k_h * k_w * c});
2942+
expected.dataInject(expectedData.begin(), expectedData.end());
2943+
EXPECT_EQ(expected, result);
28792944
}

0 commit comments

Comments
 (0)