feat(convolution - ops): add col2img operation and related test cases

Mgepahmge · Mgepahmge · commit 7f828ffa39c9 · 2025-05-11T16:12:54.000+08:00
- Added the col2img operation to handle the reverse transformation in convolution operations.
- Developed a series of test cases to validate the functionality and accuracy of the col2img operation.
- The test cases include different input sizes and configurations to ensure the robustness of the implementation.
diff --git a/include/NeuZephyr/OperationKernels.cuh b/include/NeuZephyr/OperationKernels.cuh
@@ -1009,6 +1009,9 @@ namespace nz::krnl {
     void img2col(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
               const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
               const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);
+
+    void col2img(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
+                 const size_t W_out, const size_t C_out, const size_t batches);
 #endif
 }
 
diff --git a/include/NeuZephyr/TensorOperations.cuh b/include/NeuZephyr/TensorOperations.cuh
@@ -1115,9 +1115,27 @@ namespace nz::data {
                   const size_t pad) {
         const size_t H_out = (in.shape().H() + 2 * pad - K_h) / stride + 1;
         const size_t W_out = (in.shape().W() + 2 * pad - K_w) / stride + 1;
-        T result({in.shape()[0], 1, H_out * W_out, in.shape().C() * K_h * K_w});
+        T result({in.shape()[0], 1, H_out * W_out, in.shape().C() * K_h * K_w}, in.requiresGrad());
         iImg2col(result.data(), in.data(), H_out, W_out, in.shape().C(), K_h, K_w, stride, pad,
                  in.shape().H(), in.shape().W(), in.shape()[0]);
+        if (in.requiresGrad()) {
+            iImg2col(result.grad(), in.grad(), H_out, W_out, in.shape().C(), K_h, K_w, stride, pad,
+                     in.shape().H(), in.shape().W(), in.shape()[0]);
+        }
+        return result;
+    }
+
+    DL_API void iCol2img(float* out, float* in, size_t H_out,
+                         size_t W_out, size_t C_out, size_t batches);
+
+    // Builds a (batches, C_out, H_out, W_out) tensor from `in`, taking dimension 0 of `in` as the batch count and
+    // dimension 3 as the channel count. The gradient buffer is rearranged the same way when in.requiresGrad().
+    template <typename T>
+    std::enable_if_t<is_valid_tensor_type<T>::value, T>
+    tensorCol2img(const T& in, const size_t H_out, const size_t W_out) {
+        const auto batches = in.shape()[0];
+        const auto channels = in.shape()[3];
+        const auto withGrad = in.requiresGrad();
+        T result({batches, channels, H_out, W_out}, withGrad);
+        iCol2img(result.data(), in.data(), H_out, W_out, channels, batches);
+        if (withGrad) {
+            iCol2img(result.grad(), in.grad(), H_out, W_out, channels, batches);
+        }
         return result;
     }
 }
diff --git a/src/OperationKernels.cu b/src/OperationKernels.cu
@@ -1330,4 +1330,23 @@ namespace nz::krnl {
         StreamManager<float>::Instance().submit(img2colKernel, gridDim, blockDim, 0, out, in, H_out, W_out, C,
                                                 K_h, K_w, stride, pad, H_in, W_in, batch);
     }
+
+    // Reorders `in`, read as (batches, H_out * W_out, C_out), into `out`, written as (batches, C_out, H_out, W_out).
+    // Expects a 1D launch; the grid-stride loop lets any gridDim.x >= 1 cover every output element.
+    __global__ void col2imgKernel(float* out, const float* in, const size_t H_out, const size_t W_out, const size_t C_out, const size_t batches) {
+        const size_t plane = H_out * W_out;     // elements per channel
+        const size_t perBatch = C_out * plane;  // elements per batch item
+        const size_t total = perBatch * batches;
+        // Widen to size_t before multiplying: blockDim.x * blockIdx.x is otherwise computed in 32 bits and can wrap.
+        const size_t step = static_cast<size_t>(gridDim.x) * blockDim.x;
+        for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; idx < total; idx += step) {
+            const size_t inBatch = idx % perBatch;  // offset of this element inside its batch item
+            const size_t c = inBatch / plane;
+            const size_t pixel = inBatch % plane;   // equals h * W_out + w
+            // idx - inBatch is the start of the batch item, which has the same size in both layouts.
+            out[idx] = in[(idx - inBatch) + pixel * C_out + c];
+        }
+    }
+
+    // Host-side launcher: submits col2imgKernel through StreamManager<float> with the given launch dimensions.
+    void col2img(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
+                 const size_t W_out, const size_t C_out, const size_t batches) {
+        auto&& streams = StreamManager<float>::Instance();
+        // The literal 0 is forwarded as the argument after blockDim (presumably the shared-memory size; confirm against submit).
+        streams.submit(col2imgKernel, gridDim, blockDim, 0, out, in, H_out, W_out, C_out, batches);
+    }
 }
diff --git a/src/TensorOperations.cu b/src/TensorOperations.cu
@@ -139,4 +139,11 @@ namespace nz::data {
         const dim3 grid((H_out * W_out * C * K_h * K_w * batch + BLOCKSIZE - 1) / BLOCKSIZE);
         krnl::img2col(grid, block, out, in, H_out, W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);
     }
+
+    // Launches krnl::col2img over H_out * W_out * C_out * batches elements, using BLOCKSIZE threads per block
+    // and a 1D grid rounded up so that every element is covered.
+    void iCol2img(float* out, float* in, const size_t H_out, const size_t W_out, const size_t C_out,
+        const size_t batches) {
+        const size_t total = H_out * W_out * C_out * batches;
+        if (total == 0) {
+            return;  // nothing to rearrange; a zero-sized grid is not a valid launch configuration
+        }
+        const dim3 block(BLOCKSIZE);
+        const dim3 grid((total + BLOCKSIZE - 1) / BLOCKSIZE);
+        krnl::col2img(grid, block, out, in, H_out, W_out, C_out, batches);
+    }
 }
diff --git a/test/Test.cpp b/test/Test.cpp
@@ -2941,4 +2941,39 @@ TEST(TensorBasic, img2colTest) {
     Tensor expected({n, 1, H_out * W_out, k_h * k_w * c});
     expected.dataInject(expectedData.begin(), expectedData.end());
     EXPECT_EQ(expected, result);
+}
+
+TEST(TensorBasic, col2imgTest) {
+    const size_t n = 2;
+    const size_t c = 3;
+    const size_t h = 4;
+    const size_t w = 5;
+    // Size the buffers with (count): the ({count}) form can pick the initializer_list constructor and yield one element.
+    std::vector<float> inputData(n * c * h * w);
+    std::vector<float> expectedData(n * c * h * w);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dist(0.1f, 0.9f);
+
+    for (auto& i : inputData) {
+        i = dist(gen);
+    }
+    for (size_t i = 0; i < n; i++) {
+        for (size_t j = 0; j < c; j++) {
+            for (size_t k = 0; k < h; k++) {
+                for (size_t l = 0; l < w; l++) {
+                    expectedData[i * (c*h*w) + j * (h*w) + k * w + l] =
+                        inputData[i * (c*h*w) + (k * w + l) * c + j];
+                }
+            }
+        }
+    }
+    // Host reference above: input is indexed as (n, h*w, c) and the expected output as (n, c, h, w).
+    Tensor input({n, 1, h*w, c});
+    input.dataInject(inputData.begin(), inputData.end());
+    auto result = tensorCol2img(input, h, w);
+    Tensor expected({n, c, h, w});
+    expected.dataInject(expectedData.begin(), expectedData.end());
+    EXPECT_EQ(expected, result);
 }

Original file line number	Diff line number	Diff line change
`@@ -1009,6 +1009,9 @@ namespace nz::krnl {`
`1009`	`1009`	`void img2col(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,`
`1010`	`1010`	`const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,`
`1011`	`1011`	`const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);`
	`1012`	`+`
	`1013`	`+ void col2img(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,`
	`1014`	`+ const size_t W_out, const size_t C_out, const size_t batches);`
`1012`	`1015`	`#endif`
`1013`	`1016`	`}`
`1014`	`1017`
Original file line number	Diff line number	Diff line change
`@@ -139,4 +139,11 @@ namespace nz::data {`
`139`	`139`	`const dim3 grid((H_out * W_out * C * K_h * K_w * batch + BLOCKSIZE - 1) / BLOCKSIZE);`
`140`	`140`	`krnl::img2col(grid, block, out, in, H_out, W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);`
`141`	`141`	`}`
	`142`	`+`
	`143`	`+ void iCol2img(float* out, float* in, const size_t H_out, const size_t W_out, const size_t C_out,`
	`144`	`+ const size_t batches) {`
	`145`	`+ const dim3 block(BLOCKSIZE);`
	`146`	`+ const dim3 grid((H_out * W_out * C_out * batches + BLOCKSIZE - 1) / BLOCKSIZE);`
	`147`	`+ krnl::col2img(grid, block, out, in, H_out, W_out, C_out, batches);`
	`148`	`+ }`
`142`	`149`	`}`