feat(convolution - ops): add backpropagation for img2col, computational graph nodes, and related test cases

Mgepahmge · Mgepahmge · commit 4be3aac7dd06 · 2025-05-11T16:56:15.000+08:00
- Implemented the backpropagation algorithm for the img2col operation to support gradient computation.
- Added computational graph nodes for the img2col operation and its backpropagation to facilitate automatic differentiation.
- Developed a comprehensive set of test cases to verify the correctness of the backpropagation and computational graph nodes.
- These test cases cover different input scenarios and gradients to ensure the stability and accuracy of the implementation.
diff --git a/include/NeuZephyr/Nodes.cuh b/include/NeuZephyr/Nodes.cuh
@@ -3299,6 +3299,23 @@ namespace nz::nodes {
 
             void backward() override;
         };
+
+        class DL_API Img2ColNode : public Node {  // graph node wrapping the img2col (sliding-window unfolding) operation
+        public:
+            Tensor::size_type kernelHeight;  // window height (K_h)
+            Tensor::size_type kernelWidth;   // window width (K_w)
+            Tensor::size_type stride;        // step between adjacent windows, shared by both axes
+            Tensor::size_type padding;       // padding counted on each side of H and W
+            Tensor::size_type outputHeight;  // set by the constructor: (H + 2 * padding - kernelHeight) / stride + 1
+            Tensor::size_type outputWidth;   // set by the constructor: (W + 2 * padding - kernelWidth) / stride + 1
+            // `input` supplies the image tensor; its shape fixes outputHeight/outputWidth at construction time.
+            Img2ColNode(Node* input, Tensor::size_type kernelHeight, Tensor::size_type kernelWidth,
+                        Tensor::size_type stride, Tensor::size_type padding);
+            // Unfolds the input's data into this node's output through iImg2col.
+            void forward() override;
+            // When the input requires grad, passes this node's output gradient to iImg2colBackward to update the input gradient.
+            void backward() override;
+        };
     }
 
     /**
diff --git a/include/NeuZephyr/OperationKernels.cuh b/include/NeuZephyr/OperationKernels.cuh
@@ -1010,6 +1010,10 @@ namespace nz::krnl {
               const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
               const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);
 
+    void img2colBackward(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
+              const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
+              const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);  // submits the img2col backward kernel: column gradient `in` is added into image gradient `out`
+
     void col2img(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
                  const size_t W_out, const size_t C_out, const size_t batches);
 #endif
diff --git a/include/NeuZephyr/TensorOperations.cuh b/include/NeuZephyr/TensorOperations.cuh
@@ -1109,6 +1109,10 @@ namespace nz::data {
                          const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
                          const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);
 
+    DL_API void iImg2colBackward(float* out, float* in, const size_t H_out,
+              const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
+              const size_t pad, const size_t H_in, const size_t W_in, const size_t batch);  // host entry point: sizes a 1-D launch and forwards to krnl::img2colBackward (out: image gradient, in: column gradient)
+
     template <typename T>
     std::enable_if_t<is_valid_tensor_type<T>::value, T>
     tensorImg2col(const T& in, const size_t K_h, const size_t K_w, const size_t stride,
diff --git a/src/Nodes.cu b/src/Nodes.cu
@@ -584,7 +584,7 @@ namespace nz::nodes {
 
         void ExpandNode::forward() {
             const auto size = inputs[0]->output->shape()[1] * inputs[0]->output->shape()[2] *
-                        inputs[0]->output->shape()[3];
+                inputs[0]->output->shape()[3];
             const auto total = size * newBatch;
             const dim3 block(BLOCKSIZE);
             const dim3 grid((total + block.x - 1) / block.x);
@@ -594,13 +594,46 @@ namespace nz::nodes {
         void ExpandNode::backward() {
             if (inputs[0]->output->requiresGrad()) {
                 const auto size = inputs[0]->output->shape()[1] * inputs[0]->output->shape()[2] *
-                            inputs[0]->output->shape()[3];
+                    inputs[0]->output->shape()[3];
                 const auto total = size * newBatch;
                 const dim3 block(BLOCKSIZE);
                 const dim3 grid((total + block.x - 1) / block.x);
                 Compress(grid, block, inputs[0]->output->grad(), output->grad(), size, total);
             }
         }
+
+        // Wires an img2col node onto `input` and allocates its output as {N, 1, H_out * W_out, K_h * K_w * C}.
+        Img2ColNode::Img2ColNode(Node* input, const Tensor::size_type kernelHeight, const Tensor::size_type kernelWidth,
+                                 const Tensor::size_type stride, const Tensor::size_type padding)
+            : kernelHeight(kernelHeight),
+              kernelWidth(kernelWidth),
+              stride(stride),
+              padding(padding),
+              // Sliding-window extent per axis: (in + 2 * padding - kernel) / stride + 1.
+              outputHeight((input->output->shape().H() + 2 * padding - kernelHeight) / stride + 1),
+              outputWidth((input->output->shape().W() + 2 * padding - kernelWidth) / stride + 1) {
+            const auto batchCount = input->output->shape()[0];
+            const auto channelCount = input->output->shape()[1];
+            inputs.push_back(input);
+            output = std::make_shared<Tensor>(
+                Tensor::shape_type{batchCount, 1, outputHeight * outputWidth, kernelHeight * kernelWidth * channelCount},
+                input->output->requiresGrad());  // the output tracks gradients exactly when the input does
+            type = "Img2Col";
+        }
+
+        void Img2ColNode::forward() {  // unfold the input image into rows of window values
+            const auto& src = inputs[0]->output;  // shape indices 0..3 are passed on as batch, C, H_in, W_in
+            iImg2col(output->data(), src->data(), outputHeight, outputWidth, src->shape()[1], kernelHeight,
+                     kernelWidth, stride, padding, src->shape()[2], src->shape()[3], src->shape()[0]);
+        }
+
+        void Img2ColNode::backward() {
+            // Routes this node's output gradient back onto the input gradient through iImg2colBackward.
+            const auto& src = inputs[0]->output;
+            if (!src->requiresGrad()) return;  // the input does not track gradients: nothing to propagate
+            iImg2colBackward(src->grad(), output->grad(), outputHeight, outputWidth, src->shape()[1], kernelHeight,
+                             kernelWidth, stride, padding, src->shape()[2], src->shape()[3], src->shape()[0]);
+        }
     }
 
     namespace loss {
diff --git a/src/OperationKernels.cu b/src/OperationKernels.cu
@@ -1331,6 +1331,31 @@ namespace nz::krnl {
                                                 K_h, K_w, stride, pad, H_in, W_in, batch);
     }
 
+    // One thread per column-gradient element; overlapping windows map to the same pixel, hence the atomicAdd.
+    __global__ void img2colBackwardKernel(float* out, const float* in, const size_t H_out, const size_t W_out, const size_t C,
+    const size_t K_h, const size_t K_w, const size_t stride, const size_t pad, const size_t H_in, const size_t W_in, const size_t batch) {
+        const size_t window = K_h * K_w;                 // taps per channel inside one window
+        const size_t rowLen = C * window;                // length of one unfolded row
+        const size_t perImage = H_out * W_out * rowLen;  // column elements per batch item
+        // Widen before multiplying: blockIdx.x * blockDim.x is 32-bit and wraps once the launch covers 2^32 threads.
+        const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+        if (idx >= perImage * batch) return;  // surplus threads of the last block
+        const size_t image = idx / perImage, local = idx % perImage;
+        const size_t patch = local / rowLen, tap = local % rowLen, channel = tap / window;
+        // size_t math wraps for taps in the top/left padding; converting to signed restores the negative coordinate.
+        const long long row = (patch / W_out) * stride - pad + (tap % window) / K_w;
+        const long long col = (patch % W_out) * stride - pad + tap % K_w;
+        if (row < 0 || row >= H_in || col < 0 || col >= W_in) return;  // tap lies in the padding: no pixel to update
+        atomicAdd(out + image * (C * H_in * W_in) + channel * (H_in * W_in) + row * W_in + col, in[idx]);
+    }
+
+    void img2colBackward(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
+              const size_t W_out, const size_t C, const size_t K_h, const size_t K_w, const size_t stride,
+              const size_t pad, const size_t H_in, const size_t W_in, const size_t batch) {  // host-side launcher: hands img2colBackwardKernel and its arguments to StreamManager<float>
+        StreamManager<float>::Instance().submit(img2colBackwardKernel, gridDim, blockDim, 0, out, in, H_out,  // the literal 0 is presumably the dynamic shared-memory size - confirm against StreamManager::submit
+                                                W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);
+    }
+
     __global__ void col2imgKernel(float* out, const float* in, const size_t H_out, const size_t W_out, const size_t C_out, const size_t batches) {
         const size_t idx = blockDim.x * blockIdx.x + threadIdx.x;
         if (idx >= H_out * W_out * C_out * batches) {
diff --git a/src/TensorOperations.cu b/src/TensorOperations.cu
@@ -140,8 +140,16 @@ namespace nz::data {
         krnl::img2col(grid, block, out, in, H_out, W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);
     }
 
+    // Host wrapper: sizes a 1-D launch with one thread per element of `in`, then forwards to krnl::img2colBackward.
+    void iImg2colBackward(float* out, float* in, const size_t H_out, const size_t W_out, const size_t C,
+        const size_t K_h, const size_t K_w, const size_t stride, const size_t pad, const size_t H_in, const size_t W_in, const size_t batch) {
+        const size_t total = H_out * W_out * C * K_h * K_w * batch;  // number of column-gradient elements
+        const dim3 block(BLOCKSIZE), grid((total + BLOCKSIZE - 1) / BLOCKSIZE);  // ceil-div so the tail is covered
+        krnl::img2colBackward(grid, block, out, in, H_out, W_out, C, K_h, K_w, stride, pad, H_in, W_in, batch);
+    }
+
     void iCol2img(float* out, float* in, const size_t H_out, const size_t W_out, const size_t C_out,
-        const size_t batches) {
+                  const size_t batches) {
         const dim3 block(BLOCKSIZE);
         const dim3 grid((H_out * W_out * C_out * batches + BLOCKSIZE - 1) / BLOCKSIZE);
         krnl::col2img(grid, block, out, in, H_out, W_out, C_out, batches);
diff --git a/test/Test.cpp b/test/Test.cpp
@@ -2943,6 +2943,135 @@ TEST(TensorBasic, img2colTest) {
     EXPECT_EQ(expected, result);
 }
 
+TEST(NodeBasic, img2colForward) {  // Img2ColNode::forward must reproduce a CPU reference unfold
+    const size_t n = 2;
+    const size_t c = 3;
+    const size_t h = 4;
+    const size_t w = 5;
+    const size_t k_h = 3;
+    const size_t k_w = 3;
+    const size_t stride = 1;
+    const size_t pad = 1;
+    const size_t H_out = (h + 2 * pad - k_h) / stride + 1;
+    const size_t W_out = (w + 2 * pad - k_w) / stride + 1;
+    // Sized constructor on purpose: `({count})` picks the initializer_list overload and yields ONE element.
+    std::vector<float> inputData(n * c * h * w);
+    std::vector<float> expectedData(n * H_out * W_out * k_h * k_w * c);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dist(0.1f, 0.9f);
+
+    for (auto& i : inputData) {
+        i = dist(gen);
+    }
+    // CPU reference: visit every window position (i, j) and every tap (r, s) of every channel.
+    for (size_t b = 0; b < n; ++b) {
+        for (size_t i = 0; i < H_out; ++i) {
+            for (size_t j = 0; j < W_out; ++j) {
+                const int h_start = static_cast<int>(i * stride) - pad;
+                const int w_start = static_cast<int>(j * stride) - pad;
+                // Row (i, j) of the column matrix stores its window channel-major: c_in, then r, then s.
+                for (size_t r = 0; r < k_h; ++r) {
+                    const int h_in = h_start + r;
+                    for (size_t s = 0; s < k_w; ++s) {
+                        const int w_in = w_start + s;
+                        for (size_t c_in = 0; c_in < c; ++c_in) {
+                            float val = 0.0f;  // taps that fall into the padding stay zero
+                            if (h_in >= 0 && h_in < h && w_in >= 0 && w_in < w) {
+                                const size_t input_idx =
+                                    b * (c * h * w) +
+                                    c_in * (h * w) +
+                                    h_in * w +
+                                    w_in;
+                                val = inputData[input_idx];
+                            }
+                            const size_t expected_idx =
+                                b * (H_out * W_out * k_h * k_w * c) +
+                                (i * W_out + j) * (k_h * k_w * c) +
+                                c_in * (k_h * k_w) +
+                                r * k_w +
+                                s;
+                            expectedData[expected_idx] = val;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    InputNode input({n, c, h, w});
+    input.dataInject(inputData.begin(), inputData.end());
+    Img2ColNode result(&input, k_h, k_w, stride, pad);
+    result.forward();
+    Tensor expected({n, 1, H_out * W_out, k_h * k_w * c});
+    expected.dataInject(expectedData.begin(), expectedData.end());
+    EXPECT_EQ(expected, *result.output);
+}
+
+TEST(NodeBasic, img2colBackward) {  // Img2ColNode::backward must match a CPU scatter-add reference
+    const size_t n = 2;
+    const size_t c = 3;
+    const size_t h = 4;
+    const size_t w = 5;
+    const size_t k_h = 3;
+    const size_t k_w = 3;
+    const size_t stride = 1;
+    const size_t pad = 1;
+    const size_t H_out = (h + 2 * pad - k_h) / stride + 1;
+    const size_t W_out = (w + 2 * pad - k_w) / stride + 1;
+    // Sized constructor on purpose: `({count})` yields ONE element; this form also zero-fills the accumulator.
+    std::vector<float> gradData(n * H_out * W_out * k_h * k_w * c);
+    std::vector<float> expectedGradData(n * c * h * w);
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dist(0.1f, 0.9f);
+
+    for (auto& i : gradData) {
+        i = dist(gen);
+    }
+    // CPU reference: every in-bounds tap adds its column gradient onto the pixel it was unfolded from.
+    for (size_t b = 0; b < n; ++b) {
+        for (size_t i = 0; i < H_out; ++i) {
+            for (size_t j = 0; j < W_out; ++j) {
+                const int h_start = static_cast<int>(i * stride) - pad;
+                const int w_start = static_cast<int>(j * stride) - pad;
+                for (size_t r = 0; r < k_h; ++r) {
+                    const int h_in = h_start + r;
+                    for (size_t s = 0; s < k_w; ++s) {
+                        const int w_in = w_start + s;
+                        for (size_t c_in = 0; c_in < c; ++c_in) {
+                            if (h_in >= 0 && h_in < h && w_in >= 0 && w_in < w) {  // padding taps contribute nothing
+                                const size_t input_idx =
+                                    b * (c * h * w) +
+                                    c_in * (h * w) +
+                                    h_in * w +
+                                    w_in;
+                                const size_t grad_idx =
+                                    b * (H_out * W_out * k_h * k_w * c) +
+                                    (i * W_out + j) * (k_h * k_w * c) +
+                                    c_in * (k_h * k_w) +
+                                    r * k_w +
+                                    s;
+                                expectedGradData[input_idx] += gradData[grad_idx];
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+    // NOTE(review): the kernel sums with atomicAdd in unspecified order; confirm Tensor::operator== tolerates rounding differences.
+    InputNode input({n, c, h, w}, true);
+    Img2ColNode result(&input, k_h, k_w, stride, pad);
+    result.dataInject(gradData.begin(), gradData.end(), true);  // trailing `true` presumably targets the gradient buffer - confirm
+    result.backward();
+    Tensor expected({n, c, h, w}, true);
+    expected.dataInject(expectedGradData.begin(), expectedGradData.end(), true);
+    EXPECT_EQ(expected, *input.output);
+}
+
 TEST(TenorBasic, col2imgTest) {
     const size_t n = 2;
     const size_t c = 3;