Commit ed10380
feat(Tensor): add operations for getting min/max values, finding element positions and setting values at specific positions
- Added methods to the Tensor class to retrieve the minimum and maximum values of the tensor.
- Implemented functionality to find the positions of specific elements within the tensor.
- Added the ability to set values at specific positions in the tensor.
- Conducted basic testing to ensure the correctness of these new operations.
1 parent bf8dac8 commit ed10380

3 files changed

Lines changed: 363 additions & 24 deletions

File tree

include/NeuZephyr/Tensor.cuh

Lines changed: 53 additions & 20 deletions
@@ -823,38 +823,59 @@ namespace nz::data {
         void transpose();

         /**
-         * @brief Sets a specific element of the tensor's data to a given value.
+         * @brief Sets the value of an element in the tensor or its gradient at a specified position.
          *
-         * This function modifies a specific element of the tensor's data stored in GPU memory.
-         * The element to be modified is specified by its position in the tensor's shape (given as a 2D index).
-         * The function first copies the tensor's data from GPU memory to host memory, modifies the specified element,
-         * and then copies the updated data back to the GPU memory.
+         * This member function allows you to set the value of a specific element in the tensor or its gradient.
+         * It first validates the position and the gradient setting based on the tensor's requirements.
          *
-         * @param position A `shape_type` (alias for `std::vector<int>`) representing the 2D index (row, column)
-         * of the element to modify.
-         * @param value The value to which the specified element will be set.
+         * @param position The position in the tensor where the value will be set. Memory location: host-to-device.
+         * @param value The value to be set at the specified position. Memory location: host-to-device.
+         * @param isGrad A boolean indicating whether to set the value in the gradient or in the tensor data. Memory location: host-to-device.
          *
-         * This function performs the following steps:
-         * 1. It checks if the provided position is valid within the tensor's shape. If not, an exception is thrown.
-         * 2. It copies the tensor's data from GPU memory to host memory using `cudaMemcpy`.
-         * 3. It modifies the specified element at the given position in the tensor's data.
-         * 4. It copies the updated data back to the GPU memory.
+         * @return None
+         *
+         * **Memory Management Strategy**:
+         * - A temporary array `data` of size `_size` is allocated on the host using `malloc`.
+         * - The data from the device (either tensor data or gradient) is copied to the host using `cuStrm::StreamManager<value_type>::Instance().memcpy`.
+         * - After the value is set at the specified position in the host-side data, the updated data is copied back to the device.
+         * - The temporary array `data` is freed using `free` to avoid memory leaks.
+         *
+         * **Exception Handling Mechanism**:
+         * - Throws `std::invalid_argument` if the `position` is out of bounds of the tensor's shape.
+         * - Throws `std::invalid_argument` if `isGrad` is `true` but the tensor does not require gradients.
+         * - If any of the `cuStrm::StreamManager` operations fail, it may lead to undefined behavior, as error-checking is not explicitly done in this function.
          *
-         * @throws std::invalid_argument If the provided position is out of bounds.
+         * **Relationship with Other Components**:
+         * - Depends on `cuStrm::StreamManager<value_type>::Instance()` for memory copying and data synchronization operations.
+         * - Relies on the `_shape` member variable to validate the position and calculate the index in the data array.
+         * - Uses the `_data` and `_grad` member variables to access the tensor data and its gradient.
+         *
+         * @throws std::invalid_argument When the position is out of bounds or when trying to set the gradient of a tensor that does not require gradients.
          *
          * @note
-         * - This function uses memory copying between host and device, which can introduce performance overhead.
-         * - The tensor's data is modified on the host first and then copied back to the GPU. This approach may not be
-         * the most efficient for large tensors or frequent updates.
+         * - The time complexity of this function is O(n) due to the memory copying operations, where n is the number of elements in the tensor (`_size`).
+         * - Ensure that the CUDA runtime environment is properly initialized and the device memory is valid before calling this function.
+         * - Ensure that the `position` is within the valid range of the tensor's shape to avoid exceptions.
+         * - If setting the gradient, ensure that the tensor requires gradients.
+         *
+         * @warning
+         * - If any of the `cuStrm::StreamManager` operations fail, the behavior of this function is undefined.
          *
          * @code
         * ```cpp
-         * Tensor tensor({2, 3}); // Create a tensor with shape 2x3
-         * tensor.setData(std::vector<int>({1, 2}), 7.5f); // Set the element at position (1, 2) to 7.5f
+         * Tensor tensor;
+         * Tensor::shape_type position = {0, 0, 0, 0};
+         * Tensor::value_type value = 1.0;
+         * bool isGrad = false;
+         * try {
+         *     tensor.setData(position, value, isGrad);
+         * } catch (const std::invalid_argument& e) {
+         *     std::cerr << e.what() << std::endl;
+         * }
         * ```
          * @endcode
          */
-        void setData(const shape_type& position, value_type value) const;
+        void setData(const shape_type& position, value_type value, bool isGrad = false) const;

         /// @}

@@ -1190,6 +1211,18 @@ namespace nz::data {
          */
         [[nodiscard]] value_type sum(size_type batch, size_type channel) const;

+        [[nodiscard]] value_type max() const;
+
+        [[nodiscard]] value_type max(size_type batch, size_type channel) const;
+
+        [[nodiscard]] value_type min() const;
+
+        [[nodiscard]] value_type min(size_type batch, size_type channel) const;
+
+        [[nodiscard]] shape_type find(value_type value) const;
+
+        [[nodiscard]] shape_type find(value_type value, size_type batch, size_type channel) const;
+
         /**
          * @brief Compute the sum of the exponential values of all elements in the Tensor.
          *

src/Tensor.cu

Lines changed: 96 additions & 3 deletions
@@ -407,18 +407,24 @@ namespace nz::data {
         _shape.updateStride();
     }

-    void Tensor::setData(const shape_type& position, const value_type value) const {
+    void Tensor::setData(const shape_type& position, const value_type value, const bool isGrad) const {
         if (position[0] >= _shape[0] || position[1] >= _shape[1] || position[2] >= _shape[2] || position[3] >= _shape[
             3]) {
             throw std::invalid_argument("Invalid position");
         }
+        if (isGrad && !_requires_grad) {
+            throw std::invalid_argument(
+                "Gradient setting is not allowed for tensors that do not require gradients.");
+        }
         auto* data = static_cast<value_type*>(malloc(_size * sizeof(value_type)));
-        cuStrm::StreamManager<value_type>::Instance().memcpy(data, _data, _size * sizeof(value_type),
+        cuStrm::StreamManager<value_type>::Instance().memcpy(data, isGrad ? _grad : _data, _size * sizeof(value_type),
                                                              cudaMemcpyDeviceToHost);
+        cuStrm::StreamManager<value_type>::Instance().syncData(data);
         data[position[0] * _shape.getStride(0) + position[1] * _shape.getStride(1) + position[2] * _shape.getStride(2) +
             position[3] * _shape.getStride(3)] = value;
-        cuStrm::StreamManager<value_type>::Instance().memcpy(_data, data, _size * sizeof(value_type),
+        cuStrm::StreamManager<value_type>::Instance().memcpy(isGrad ? _grad : _data, data, _size * sizeof(value_type),
                                                              cudaMemcpyHostToDevice);
+        cuStrm::StreamManager<value_type>::Instance().syncData(isGrad ? _grad : _data);
         free(data);
     }

@@ -565,6 +571,7 @@ namespace nz::data {
         krnl::Summation(grid, block, block.x / WARP_SIZE * sizeof(float), dData, _data, _size);
         cuStrm::StreamManager<value_type>::Instance().memcpy(hData, dData, grid.x * sizeof(value_type),
                                                              cudaMemcpyDeviceToHost);
+        cuStrm::StreamManager<value_type>::Instance().syncData(hData);
         value_type result = 0;
         for (auto i = 0; i < grid.x; ++i) {
             result += hData[i];

@@ -588,6 +595,7 @@ namespace nz::data {
         krnl::Summation(grid, block, block.x / WARP_SIZE * sizeof(float), dData, _data, size, offset);
         cuStrm::StreamManager<value_type>::Instance().memcpy(hData, dData, grid.x * sizeof(value_type),
                                                              cudaMemcpyDeviceToHost);
+        cuStrm::StreamManager<value_type>::Instance().syncData(hData);
         value_type result = 0;
         for (auto i = 0; i < grid.x; ++i) {
             result += hData[i];
@@ -597,6 +605,89 @@ namespace nz::data {
         return result;
     }

+    Tensor::value_type Tensor::max() const {
+        auto hData = hostData();
+        value_type result = std::numeric_limits<value_type>::min();
+        for (auto i = 0; i < _size; ++i) {
+            if (hData[i] > result) {
+                result = hData[i];
+            }
+        }
+        return result;
+    }
+
+    Tensor::value_type Tensor::max(const size_type batch, const size_type channel) const {
+        if (batch >= _shape[0] || channel >= _shape[1]) {
+            throw std::invalid_argument("Invalid position");
+        }
+        const auto offset = batch * _shape.getStride(0) + channel * _shape.getStride(1);
+        auto hData = hostData();
+        value_type result = std::numeric_limits<value_type>::min();
+        for (auto i = 0; i < _shape[2] * _shape[3]; ++i) {
+            if (hData[offset + i] > result) {
+                result = hData[offset + i];
+            }
+        }
+        return result;
+    }
+
+    Tensor::value_type Tensor::min() const {
+        auto hData = hostData();
+        value_type result = std::numeric_limits<value_type>::max();
+        for (auto i = 0; i < _size; ++i) {
+            if (hData[i] < result) {
+                result = hData[i];
+            }
+        }
+        return result;
+    }
+
+    Tensor::value_type Tensor::min(const size_type batch, const size_type channel) const {
+        if (batch >= _shape[0] || channel >= _shape[1]) {
+            throw std::invalid_argument("Invalid position");
+        }
+        const auto offset = batch * _shape.getStride(0) + channel * _shape.getStride(1);
+        auto hData = hostData();
+        value_type result = std::numeric_limits<value_type>::max();
+        for (auto i = 0; i < _shape[2] * _shape[3]; ++i) {
+            if (hData[offset + i] < result) {
+                result = hData[offset + i];
+            }
+        }
+        return result;
+    }
+
+    Tensor::shape_type Tensor::find(const value_type value) const {
+        auto hData = hostData();
+        auto index = 0;
+        for (auto i = 0; i < _size; ++i) {
+            if (hData[i] == value) {
+                index = i;
+                break;
+            }
+        }
+        auto n = index / (_shape[1] * _shape[2] * _shape[3]);
+        auto c = (index % (_shape[1] * _shape[2] * _shape[3])) / (_shape[2] * _shape[3]);
+        auto h = (index % (_shape[2] * _shape[3])) / _shape[3];
+        auto w = index % _shape[3];
+        return {n, c, h, w};
+    }
+
+    Tensor::shape_type Tensor::find(value_type value, size_type batch, size_type channel) const {
+        auto hData = hostData();
+        auto index = 0;
+        auto offset = batch * _shape.getStride(0) + channel * _shape.getStride(1);
+        for (auto i = 0; i < _shape[2] * _shape[3]; ++i) {
+            if (hData[offset + i] == value) {
+                index = i;
+                break;
+            }
+        }
+        auto h = index / _shape[3];
+        auto w = index % _shape[3];
+        return {batch, channel, h, w};
+    }
+
     Tensor::value_type Tensor::expSum() const {
         const dim3 block(256);
         const dim3 grid((_size + block.x - 1) / block.x);
@@ -606,6 +697,7 @@ namespace nz::data {
         krnl::SummationExp(grid, block, block.x / WARP_SIZE * sizeof(float), dData, _data, _size);
         cuStrm::StreamManager<value_type>::Instance().memcpy(hData, dData, grid.x * sizeof(value_type),
                                                              cudaMemcpyDeviceToHost);
+        cuStrm::StreamManager<value_type>::Instance().syncData(hData);
         value_type result = 0;
         for (auto i = 0; i < grid.x; ++i) {
             result += hData[i];

@@ -629,6 +721,7 @@ namespace nz::data {
         krnl::SummationExp(grid, block, block.x / WARP_SIZE * sizeof(float), dData, _data, size, offset);
         cuStrm::StreamManager<value_type>::Instance().memcpy(hData, dData, grid.x * sizeof(value_type),
                                                              cudaMemcpyDeviceToHost);
+        cuStrm::StreamManager<value_type>::Instance().syncData(hData);
         value_type result = 0;
         for (auto i = 0; i < grid.x; ++i) {
             result += hData[i];
