feat(pooling): add average pooling layer and related test cases

Mgepahmge · Mgepahmge · commit 8540b6b34e3c · 2025-05-12T17:12:54.000+08:00
- Added an average pooling layer to the model architecture, enabling the downsampling of feature maps using average pooling.
- Implemented the forward and backward pass logic for the average pooling layer, including the calculation of average values within pooling windows and the distribution of output gradients back to the input.
- Developed a set of test cases to verify the correctness of the average pooling layer implementation.
- The test cases cover different input sizes, pooling kernel sizes, and stride values to ensure the robustness of the layer.
diff --git a/include/NeuZephyr/Model.cuh b/include/NeuZephyr/Model.cuh
@@ -68,12 +68,26 @@ namespace nz {
                      Tensor::size_type kernelWidth,
                      Tensor::size_type stride, Tensor::size_type padding, bool bias = true);
 
+        /**
+         * @brief Appends a 2D average-pooling node to the model.
+         *
+         * @param input Node whose output is pooled; it is added to the compute graph if it is not in it yet.
+         * @param poolSize Side length of the square pooling window.
+         * @param stride Step between consecutive pooling windows, used for both height and width.
+         * @param padding Padding applied to every border of height and width. Defaults to 0.
+         * @return Pointer to the newly created pooling node.
+         */
+        Node* AvgPool2d(Node* input, Tensor::size_type poolSize, Tensor::size_type stride,
+                        Tensor::size_type padding = 0);
+
         void MSELoss(Node* input, Node* target);
 
         void BCELoss(Node* input, Node* target);
 
         void defaultOutput(Node* input);
     };
+
+    /**
+     * @brief Creates an average-pooling node on top of @p input and registers it with the model.
+     *
+     * The input node is added to the compute graph first when the graph does not contain it.
+     * The new node is recorded in hiddenNodes and added to the compute graph before being returned.
+     *
+     * @param input Node whose output is pooled.
+     * @param poolSize Side length of the square pooling window.
+     * @param stride Step between consecutive pooling windows.
+     * @param padding Padding applied to every border of height and width.
+     * @return Pointer to the newly allocated pooling node.
+     */
+    inline Node* Model::AvgPool2d(Node* input, Tensor::size_type poolSize, Tensor::size_type stride,
+        Tensor::size_type padding) {
+        // The pooling node reads from `input`, so the graph has to know about it.
+        const bool inputRegistered = computeGraph.inGraph(input);
+        if (!inputRegistered) {
+            computeGraph.addNode(input);
+        }
+
+        // NOTE(review): allocated with raw `new`; presumably released by Model via hiddenNodes - confirm.
+        calc::AveragePoolingNode* const poolNode =
+            new calc::AveragePoolingNode(input, poolSize, stride, padding);
+        hiddenNodes.push_back(poolNode);
+        computeGraph.addNode(poolNode);
+        return poolNode;
+    }
 }
 
 
diff --git a/include/NeuZephyr/Nodes.cuh b/include/NeuZephyr/Nodes.cuh
@@ -3329,6 +3329,20 @@ namespace nz::nodes {
 
             void backward() override;
         };
+
+        /**
+         * @brief Compute-graph node that applies 2D average pooling to the output of one input node.
+         *
+         * The output tensor keeps the input's first two dimensions; its last two dimensions are
+         * derived from the input's last two dimensions, poolSize, stride and padding via OUTPUT_DIM.
+         */
+        class DL_API AveragePoolingNode : public Node {
+        public:
+            Tensor::size_type poolSize; ///< Side length of the square pooling window.
+            Tensor::size_type stride; ///< Step between consecutive windows along height and width.
+            Tensor::size_type padding; ///< Padding applied to every border of height and width.
+
+            /**
+             * @brief Stores the pooling parameters, registers @p input and allocates the output tensor.
+             *
+             * @param input Node whose output tensor is pooled.
+             * @param poolSize Side length of the square pooling window.
+             * @param stride Step between consecutive windows.
+             * @param padding Padding applied to every border of height and width.
+             */
+            AveragePoolingNode(Node* input, Tensor::size_type poolSize, Tensor::size_type stride,
+                               Tensor::size_type padding);
+
+            /// @brief Computes the pooled output from the input node's data via iAveragePooling.
+            void forward() override;
+
+            /// @brief Propagates the output gradient into the input node's gradient via iAveragePoolingBackward.
+            void backward() override;
+        };
     }
 
     /**
diff --git a/include/NeuZephyr/OperationKernels.cuh b/include/NeuZephyr/OperationKernels.cuh
@@ -48,6 +48,9 @@
 #include <vector>
 #include "Dimension.cuh"
 
+/**
+ * @brief Output extent of a strided window operation along one dimension.
+ *
+ * Evaluates to (INPUT + 2 * PADDING - KERNEL) / STRIDE + 1 using integer division, with every
+ * argument cast to size_t.
+ *
+ * @note The arithmetic is unsigned. Callers must ensure STRIDE != 0 and
+ *       INPUT + 2 * PADDING >= KERNEL; otherwise the division is undefined or the
+ *       subtraction wraps around to a very large value.
+ */
+#define OUTPUT_DIM(INPUT, KERNEL, STRIDE, PADDING) \
+( ((size_t)(INPUT) + 2*(size_t)(PADDING) - (size_t)(KERNEL)) / (size_t)(STRIDE) + 1 )
+
 /**
  * @namespace nz::krnl
  * @brief High-Performance CUDA Kernel Implementations for Tensor Computations
@@ -1019,6 +1022,16 @@ namespace nz::krnl {
 
     void col2imgBackward(const dim3 gridDim, const dim3 blockDim, float* out, float* in, const size_t H_out,
                          const size_t W_out, const size_t C_out, const size_t batches);
+
+    /**
+     * @brief Launches the average-pooling forward kernel.
+     *
+     * The kernel treats @p in and @p out as contiguous [batches, channels, H, W] buffers and uses
+     * one thread per output element, so the launch must provide at least
+     * batches * channels * H_out * W_out threads in a 1D configuration.
+     *
+     * @param gridDim Grid dimensions of the launch.
+     * @param blockDim Block dimensions of the launch.
+     * @param out Output buffer (batches * channels * H_out * W_out elements).
+     * @param in Input buffer (batches * channels * H_in * W_in elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Input height.
+     * @param W_in Input width.
+     * @param H_out Output height.
+     * @param W_out Output width.
+     */
+    void AveragePooling(const dim3 gridDim, const dim3 blockDim, float* out, float* in,
+        const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in,
+        const size_t H_out, const size_t W_out);
+
+    /**
+     * @brief Launches the average-pooling backward kernel.
+     *
+     * One thread per element of @p in spreads that value, divided by the number of window
+     * positions inside the input, over the corresponding window of @p out using atomicAdd.
+     * The kernel only adds to @p out; it never clears it.
+     *
+     * @param gridDim Grid dimensions of the launch.
+     * @param blockDim Block dimensions of the launch.
+     * @param out Buffer receiving the accumulated values (batches * channels * H_in * W_in elements).
+     * @param in Buffer holding the values to distribute (batches * channels * H_out * W_out elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Height of the @p out planes.
+     * @param W_in Width of the @p out planes.
+     * @param H_out Height of the @p in planes.
+     * @param W_out Width of the @p in planes.
+     */
+    void AveragePoolingBackward(const dim3 gridDim, const dim3 blockDim, float* out, float* in,
+        const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in,
+        const size_t H_out, const size_t W_out);
 #endif
 }
 
diff --git a/include/NeuZephyr/TensorOperations.cuh b/include/NeuZephyr/TensorOperations.cuh
@@ -1144,5 +1144,33 @@ namespace nz::data {
     }
 
     DL_API void iCol2imgBackward(float* out, float* in, size_t H_out, size_t W_out, size_t C_out, size_t batches);
+
+    /**
+     * @brief Runs average pooling on raw buffers.
+     *
+     * Builds a 1D launch configuration with BLOCKSIZE threads per block and enough blocks to
+     * cover batches * channels * H_out * W_out elements, then calls krnl::AveragePooling.
+     *
+     * @param out Output buffer (batches * channels * H_out * W_out elements).
+     * @param in Input buffer (batches * channels * H_in * W_in elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Input height.
+     * @param W_in Input width.
+     * @param H_out Output height.
+     * @param W_out Output width.
+     */
+    DL_API void iAveragePooling(float* out, float* in,
+                                size_t pool_size, size_t stride, size_t padding,
+                                size_t batches, size_t channels, size_t H_in, size_t W_in,
+                                size_t H_out, size_t W_out);
+
+    /**
+     * @brief Returns a new tensor holding the average-pooled version of @p in.
+     *
+     * The result keeps the first two dimensions of @p in; its height and width are computed with
+     * OUTPUT_DIM. When @p in requires gradients, the result does too and the same pooling is
+     * applied to the gradient buffer of @p in to fill the gradient buffer of the result.
+     *
+     * @tparam T Tensor type accepted by is_valid_tensor_type.
+     * @param in Tensor to pool.
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @return The pooled tensor.
+     */
+    template <typename T>
+    std::enable_if_t<is_valid_tensor_type<T>::value, T>
+    tensorAveragePooling(const T& in, const size_t pool_size, const size_t stride,
+                          const size_t padding) {
+        // Read the input geometry once instead of re-querying the shape for every argument.
+        const auto batches = in.shape()[0];
+        const auto channels = in.shape()[1];
+        const auto H_in = in.shape().H();
+        const auto W_in = in.shape().W();
+        const size_t H_out = OUTPUT_DIM(H_in, pool_size, stride, padding);
+        const size_t W_out = OUTPUT_DIM(W_in, pool_size, stride, padding);
+        const auto tracksGrad = in.requiresGrad();
+
+        T result({batches, channels, H_out, W_out}, tracksGrad);
+        iAveragePooling(result.data(), in.data(), pool_size, stride, padding,
+                        batches, channels, H_in, W_in, H_out, W_out);
+        if (tracksGrad) {
+            // Pool the gradient buffer with the same geometry as the data buffer.
+            iAveragePooling(result.grad(), in.grad(), pool_size, stride, padding,
+                            batches, channels, H_in, W_in, H_out, W_out);
+        }
+        return result;
+    }
+
+    /**
+     * @brief Runs the backward pass of average pooling on raw buffers.
+     *
+     * Builds a 1D launch configuration with BLOCKSIZE threads per block and enough blocks to
+     * cover batches * channels * H_out * W_out elements, then calls krnl::AveragePoolingBackward.
+     * The kernel adds into @p out with atomicAdd and does not clear it beforehand.
+     *
+     * @param out Buffer receiving the accumulated values (batches * channels * H_in * W_in elements).
+     * @param in Buffer holding the values to distribute (batches * channels * H_out * W_out elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Height of the @p out planes.
+     * @param W_in Width of the @p out planes.
+     * @param H_out Height of the @p in planes.
+     * @param W_out Width of the @p in planes.
+     */
+    DL_API void iAveragePoolingBackward(float* out, float* in,
+                                size_t pool_size, size_t stride, size_t padding,
+                                size_t batches, size_t channels, size_t H_in, size_t W_in,
+                                size_t H_out, size_t W_out);
 }
 #endif //TENSOROPERATIONS_CUH
diff --git a/src/Nodes.cu b/src/Nodes.cu
@@ -661,6 +661,29 @@ namespace nz::nodes {
             iCol2imgBackward(inputs[0]->output->grad(), output->grad(), outputHeight, outputWidth, outputChannels,
                 inputs[0]->output->shape()[0]);
         }
+
+        /**
+         * @brief Stores the pooling parameters, registers @p input and allocates the output tensor.
+         *
+         * The output shape is {input dim 0, input dim 1, OUTPUT_DIM(input dim 2), OUTPUT_DIM(input dim 3)}
+         * and the output requires gradients exactly when the input's output does.
+         *
+         * @param input Node whose output tensor is pooled.
+         * @param poolSize Side length of the square pooling window.
+         * @param stride Step between consecutive windows.
+         * @param padding Padding applied to every border of height and width.
+         */
+        AveragePoolingNode::AveragePoolingNode(Node* input, Tensor::size_type poolSize, Tensor::size_type stride,
+            Tensor::size_type padding) : poolSize(poolSize), stride(stride), padding(padding) {
+            inputs.push_back(input);
+
+            // Leading dimensions are carried over; the spatial ones shrink or grow per OUTPUT_DIM.
+            const auto batches = input->output->shape()[0];
+            const auto channels = input->output->shape()[1];
+            const auto pooledHeight = OUTPUT_DIM(input->output->shape()[2], poolSize, stride, padding);
+            const auto pooledWidth = OUTPUT_DIM(input->output->shape()[3], poolSize, stride, padding);
+
+            output = std::make_shared<Tensor>(
+                Tensor::shape_type{batches, channels, pooledHeight, pooledWidth},
+                input->output->requiresGrad());
+            type = "AveragePooling";
+        }
+
+        /**
+         * @brief Fills the output tensor's data with the average-pooled data of the input node.
+         */
+        void AveragePoolingNode::forward() {
+            const auto& source = inputs[0]->output;
+            iAveragePooling(output->data(), source->data(), poolSize, stride, padding,
+                source->shape()[0], source->shape()[1], source->shape()[2], source->shape()[3],
+                output->shape()[2], output->shape()[3]);
+        }
+
+        /**
+         * @brief Distributes the output tensor's gradient into the input node's gradient buffer.
+         */
+        void AveragePoolingNode::backward() {
+            const auto& source = inputs[0]->output;
+            iAveragePoolingBackward(source->grad(), output->grad(), poolSize, stride, padding,
+                source->shape()[0], source->shape()[1], source->shape()[2], source->shape()[3],
+                output->shape()[2], output->shape()[3]);
+        }
     }
 
     namespace loss {
diff --git a/src/OperationKernels.cu b/src/OperationKernels.cu
@@ -1400,4 +1400,92 @@ namespace nz::krnl {
         StreamManager<float>::Instance().submit(col2imgBackwardKernel, gridDim, blockDim, 0, out, in, H_out, W_out,
                                                 C_out, batches);
     }
+
+    /**
+     * @brief Average-pooling forward kernel; each thread produces one output element.
+     *
+     * `in` and `out` are indexed as contiguous [batches, channels, H, W] buffers. A thread maps its
+     * flat index to an output position (plane, h, w), clips the pool_size x pool_size window that
+     * starts at (h * stride - padding, w * stride - padding) to the input plane and writes the
+     * mean of the values inside it. Positions that fall into the padding are neither summed nor
+     * counted; a window that lies entirely outside the input produces 0.
+     *
+     * Launch with a 1D grid and block providing at least batches * channels * H_out * W_out
+     * threads; surplus threads return without touching memory.
+     *
+     * @param out Output buffer (batches * channels * H_out * W_out elements).
+     * @param in Input buffer (batches * channels * H_in * W_in elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between neighbouring windows in both directions.
+     * @param padding Implicit border width on every side of the input plane.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Input height.
+     * @param W_in Input width.
+     * @param H_out Output height.
+     * @param W_out Output width.
+     */
+    __global__ void AveragePoolingKernel(float* out, const float* in, const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in, const size_t H_out, const size_t W_out) {
+        // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be evaluated in 32 bits.
+        const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+        if (idx >= batches * channels * H_out * W_out) {
+            return;
+        }
+        const size_t planeSizeOut = H_out * W_out;
+        const size_t plane = idx / planeSizeOut; // == batch * channels + channel
+        const size_t h = (idx % planeSizeOut) / W_out;
+        const size_t w = idx % W_out;
+
+        // Signed window origin; negative coordinates lie inside the padding.
+        const long long h_start = static_cast<long long>(h * stride) - static_cast<long long>(padding);
+        const long long w_start = static_cast<long long>(w * stride) - static_cast<long long>(padding);
+        const long long h_stop = h_start + static_cast<long long>(pool_size);
+        const long long w_stop = w_start + static_cast<long long>(pool_size);
+
+        // Clip the window to the input plane once instead of testing every element.
+        const long long h_begin = h_start > 0 ? h_start : 0;
+        const long long w_begin = w_start > 0 ? w_start : 0;
+        const long long h_end = h_stop < static_cast<long long>(H_in) ? h_stop : static_cast<long long>(H_in);
+        const long long w_end = w_stop < static_cast<long long>(W_in) ? w_stop : static_cast<long long>(W_in);
+        if (h_begin >= h_end || w_begin >= w_end) {
+            // No valid element in the window.
+            out[idx] = 0.0f;
+            return;
+        }
+
+        const float* src = in + plane * (H_in * W_in);
+        // Accumulate in a register; out[idx] is written exactly once at the end.
+        float sum = 0.0f;
+        for (long long r = h_begin; r < h_end; r++) {
+            const float* row = src + static_cast<size_t>(r) * W_in;
+            for (long long c = w_begin; c < w_end; c++) {
+                sum += row[c];
+            }
+        }
+        const size_t count = static_cast<size_t>(h_end - h_begin) * static_cast<size_t>(w_end - w_begin);
+        out[idx] = sum / static_cast<float>(count);
+    }
+
+    /**
+     * @brief Host-side launcher for AveragePoolingKernel.
+     *
+     * Forwards the launch configuration and all kernel arguments to
+     * StreamManager<float>::Instance().submit.
+     *
+     * @param gridDim Grid dimensions of the launch.
+     * @param blockDim Block dimensions of the launch.
+     * @param out Output buffer (batches * channels * H_out * W_out elements).
+     * @param in Input buffer (batches * channels * H_in * W_in elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Input height.
+     * @param W_in Input width.
+     * @param H_out Output height.
+     * @param W_out Output width.
+     */
+    void AveragePooling(const dim3 gridDim, const dim3 blockDim, float* out, float* in,
+        const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in,
+        const size_t H_out, const size_t W_out) {
+        StreamManager<float>::Instance().submit(AveragePoolingKernel, gridDim, blockDim, 0, out, in,
+            pool_size, stride, padding, batches, channels, H_in, W_in, H_out, W_out);
+    }
+
+    /**
+     * @brief Average-pooling backward kernel; each thread handles one element of `in`.
+     *
+     * `in` is indexed as a contiguous [batches, channels, H_out, W_out] buffer and `out` as a
+     * contiguous [batches, channels, H_in, W_in] buffer. A thread clips its pool_size x pool_size
+     * window, starting at (h * stride - padding, w * stride - padding), to the `out` plane and
+     * adds in[idx] / count to every position inside it, where count is the number of positions
+     * in the clipped window. This is the same divisor the forward kernel uses.
+     *
+     * Updates use atomicAdd because windows of different threads can overlap. The kernel only
+     * adds to `out`; it never clears it. A window that lies entirely outside the plane
+     * contributes nothing.
+     *
+     * Launch with a 1D grid and block providing at least batches * channels * H_out * W_out
+     * threads; surplus threads return without touching memory.
+     *
+     * @param out Buffer receiving the accumulated values (batches * channels * H_in * W_in elements).
+     * @param in Buffer holding the values to distribute (batches * channels * H_out * W_out elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between neighbouring windows in both directions.
+     * @param padding Implicit border width on every side of the `out` plane.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Height of the `out` planes.
+     * @param W_in Width of the `out` planes.
+     * @param H_out Height of the `in` planes.
+     * @param W_out Width of the `in` planes.
+     */
+    __global__ void AveragePoolingBackwardKernel(float* out, const float* in, const size_t pool_size, const size_t stride, const size_t padding,
+    const size_t batches, const size_t channels, const size_t H_in, const size_t W_in, const size_t H_out, const size_t W_out) {
+        // Widen before multiplying: blockIdx.x * blockDim.x would otherwise be evaluated in 32 bits.
+        const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+        if (idx >= batches * channels * H_out * W_out) {
+            return;
+        }
+        const size_t planeSizeOut = H_out * W_out;
+        const size_t plane = idx / planeSizeOut; // == batch * channels + channel
+        const size_t h = (idx % planeSizeOut) / W_out;
+        const size_t w = idx % W_out;
+
+        // Signed window origin; negative coordinates lie inside the padding.
+        const long long h_start = static_cast<long long>(h * stride) - static_cast<long long>(padding);
+        const long long w_start = static_cast<long long>(w * stride) - static_cast<long long>(padding);
+        const long long h_stop = h_start + static_cast<long long>(pool_size);
+        const long long w_stop = w_start + static_cast<long long>(pool_size);
+
+        // Clip the window to the plane; the valid positions form a rectangle, so their
+        // number is known without a separate counting pass.
+        const long long h_begin = h_start > 0 ? h_start : 0;
+        const long long w_begin = w_start > 0 ? w_start : 0;
+        const long long h_end = h_stop < static_cast<long long>(H_in) ? h_stop : static_cast<long long>(H_in);
+        const long long w_end = w_stop < static_cast<long long>(W_in) ? w_stop : static_cast<long long>(W_in);
+        if (h_begin >= h_end || w_begin >= w_end) {
+            // Nothing to update, and avoids dividing by a zero count.
+            return;
+        }
+
+        const size_t count = static_cast<size_t>(h_end - h_begin) * static_cast<size_t>(w_end - w_begin);
+        // Every position in the window receives the same share.
+        const float share = in[idx] / static_cast<float>(count);
+        float* dst = out + plane * (H_in * W_in);
+        for (long long r = h_begin; r < h_end; r++) {
+            float* row = dst + static_cast<size_t>(r) * W_in;
+            for (long long c = w_begin; c < w_end; c++) {
+                atomicAdd(row + c, share);
+            }
+        }
+    }
+
+    /**
+     * @brief Host-side launcher for AveragePoolingBackwardKernel.
+     *
+     * Forwards the launch configuration and all kernel arguments to
+     * StreamManager<float>::Instance().submit.
+     *
+     * @param gridDim Grid dimensions of the launch.
+     * @param blockDim Block dimensions of the launch.
+     * @param out Buffer receiving the accumulated values (batches * channels * H_in * W_in elements).
+     * @param in Buffer holding the values to distribute (batches * channels * H_out * W_out elements).
+     * @param pool_size Side length of the square pooling window.
+     * @param stride Step between consecutive windows.
+     * @param padding Padding applied to every border of height and width.
+     * @param batches Size of the first tensor dimension.
+     * @param channels Size of the second tensor dimension.
+     * @param H_in Height of the @p out planes.
+     * @param W_in Width of the @p out planes.
+     * @param H_out Height of the @p in planes.
+     * @param W_out Width of the @p in planes.
+     */
+    void AveragePoolingBackward(const dim3 gridDim, const dim3 blockDim, float* out, float* in,
+        const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in,
+        const size_t H_out, const size_t W_out) {
+        StreamManager<float>::Instance().submit(AveragePoolingBackwardKernel, gridDim, blockDim, 0, out, in,
+            pool_size, stride, padding, batches, channels, H_in, W_in, H_out, W_out);
+    }
 }
diff --git a/src/TensorOperations.cu b/src/TensorOperations.cu
@@ -160,4 +160,20 @@ namespace nz::data {
         const dim3 grid((H_out * W_out * C_out * batches + BLOCKSIZE - 1) / BLOCKSIZE);
         krnl::col2imgBackward(grid, block, out, in, H_out, W_out, C_out, batches);
     }
+
+    /**
+     * @brief Launches average pooling with one thread per output element.
+     *
+     * Uses BLOCKSIZE threads per block and rounds the block count up so that all
+     * batches * channels * H_out * W_out output elements are covered.
+     */
+    void iAveragePooling(float* out, float* in, const size_t pool_size, const size_t stride, const size_t padding,
+        const size_t batches, const size_t channels, const size_t H_in, const size_t W_in, const size_t H_out,
+        const size_t W_out) {
+        const size_t outputElements = batches * channels * H_out * W_out;
+        const dim3 threadsPerBlock(BLOCKSIZE);
+        // Ceiling division: a partially filled last block is still launched.
+        const dim3 blockCount((outputElements + BLOCKSIZE - 1) / BLOCKSIZE);
+        krnl::AveragePooling(blockCount, threadsPerBlock, out, in, pool_size, stride, padding,
+                             batches, channels, H_in, W_in, H_out, W_out);
+    }
+
+    /**
+     * @brief Launches the average-pooling backward pass with one thread per element of @p in.
+     *
+     * Uses BLOCKSIZE threads per block and rounds the block count up so that all
+     * batches * channels * H_out * W_out elements are covered.
+     */
+    void iAveragePoolingBackward(float* out, float* in, const size_t pool_size, const size_t stride, const size_t padding, const size_t batches,
+        const size_t channels, const size_t H_in, const size_t W_in, const size_t H_out, const size_t W_out) {
+        const size_t pooledElements = batches * channels * H_out * W_out;
+        const dim3 threadsPerBlock(BLOCKSIZE);
+        // Ceiling division: a partially filled last block is still launched.
+        const dim3 blockCount((pooledElements + BLOCKSIZE - 1) / BLOCKSIZE);
+        krnl::AveragePoolingBackward(blockCount, threadsPerBlock, out, in, pool_size, stride, padding,
+                                     batches, channels, H_in, W_in, H_out, W_out);
+    }
 }
diff --git a/test/Test.cpp b/test/Test.cpp