Skip to content

Commit c9b4be5

Browse files
Fix out-of-bounds memory access in SetKernel for 0-size tensor (#78486)
* Fix out-of-bounds memory access in SetKernel with 0-size source tensor When calling Tensor.set_(source, shape, stride, offset) with a 0-size source tensor and non-zero target shape, the original code had a missing branch in the conditional logic: when source.numel()==0 and x.numel()!=0, no branch was executed, leaving `out` with its original data holder but with the user-specified meta (shape/stride). This caused ContiguousKernel to read beyond allocated memory when converting the strided tensor to contiguous. The fix forces the output tensor to inherit the source's 0-size dims/strides when source has no elements, preventing out-of-bounds access. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * fix(phi): fix set_kernel access memory error when tensor is empty * test(inplace): fix assert of 0-size tensor set_ behaviour * fix(phi): fix CPU * test(inplace): add 0-dim tensor set to non-0-size tensor tests * fix(set): handle zero-element output in inplace operation --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 7c6077b commit c9b4be5

File tree

2 files changed

+173
-6
lines changed

2 files changed

+173
-6
lines changed

paddle/phi/kernels/set_kernel.cc

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,26 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414
#include "paddle/phi/kernels/set_kernel.h"
15+
#include <cstring>
16+
#include "paddle/phi/common/memory_utils.h"
1517
#include "paddle/phi/core/kernel_registry.h"
1618
#include "paddle/phi/kernels/full_kernel.h"
1719
namespace phi {
1820

21+
// Compute the minimum number of elements required in storage to hold
22+
// a strided view described by dims, stride and offset.
23+
static int64_t ComputeRequiredStorageSize(const std::vector<int64_t>& dims,
24+
const std::vector<int64_t>& stride,
25+
int64_t offset) {
26+
int64_t required = offset;
27+
for (size_t i = 0; i < dims.size(); ++i) {
28+
if (dims[i] > 0) {
29+
required += (dims[i] - 1) * stride[i];
30+
}
31+
}
32+
return required + 1; // +1 for the last element itself
33+
}
34+
1935
template <typename T, typename Context>
2036
void SetKernel(const Context& dev_ctx,
2137
const DenseTensor& x,
@@ -29,11 +45,58 @@ void SetKernel(const Context& dev_ctx,
2945
meta.strides = DDim(stride.data(), static_cast<int>(stride.size()));
3046
meta.offset = offset;
3147
if (x.numel() == 0 || source.numel() == 0) {
32-
if (source.numel() != 0) {
48+
int64_t out_numel = 1;
49+
for (auto d : dims) {
50+
out_numel *= d;
51+
}
52+
if (source.numel() == 0 && x.numel() != 0) {
53+
// Source is empty but x has storage. Reuse x's storage and apply
54+
// the user-specified meta, matching PyTorch behavior.
55+
if (out_numel == 0) {
56+
// Output has 0 elements — no storage needed, just set meta.
57+
out->set_meta(meta);
58+
out->ShareInplaceVersionCounterWith(x);
59+
return;
60+
}
61+
// If the strided view requires more storage than x provides,
62+
// allocate a larger zero-filled buffer and copy x's data into it
63+
// to avoid out-of-bounds reads on elements beyond x's allocation.
64+
int64_t required_size = ComputeRequiredStorageSize(dims, stride, offset);
65+
if (required_size > x.numel()) {
66+
DenseTensor tmp;
67+
std::vector<int64_t> alloc_shape = {required_size};
68+
Full<T, Context>(dev_ctx, alloc_shape, 0, &tmp);
69+
if (dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU) {
70+
std::memcpy(tmp.data<T>(), x.data<T>(), x.numel() * sizeof(T));
71+
} else {
72+
memory_utils::Copy(dev_ctx.GetPlace(),
73+
tmp.data<T>(),
74+
dev_ctx.GetPlace(),
75+
x.data<T>(),
76+
x.numel() * sizeof(T),
77+
nullptr);
78+
}
79+
out->clear();
80+
*out = DenseTensor{tmp.Holder(), meta};
81+
} else {
82+
out->set_meta(meta);
83+
}
84+
} else if (source.numel() == 0 && x.numel() == 0 && out_numel != 0) {
85+
// Both x and source are 0-size but user wants non-zero shape.
86+
// Allocate zero-filled storage to avoid null pointer access.
87+
int64_t required_size = ComputeRequiredStorageSize(dims, stride, offset);
88+
DenseTensor tmp;
89+
std::vector<int64_t> alloc_shape = {required_size};
90+
Full<T, Context>(dev_ctx, alloc_shape, 0, &tmp);
91+
out->clear();
92+
*out = DenseTensor{tmp.Holder(), meta};
93+
} else if (source.numel() != 0) {
94+
out->clear();
95+
*out = DenseTensor{source.Holder(), meta};
96+
} else {
97+
// Both 0-size, output also 0-size
3398
out->clear();
3499
*out = DenseTensor{source.Holder(), meta};
35-
} else if (x.numel() == 0) {
36-
Full<T, Context>(dev_ctx, out->dims(), 0, out);
37100
}
38101
out->ShareInplaceVersionCounterWith(x);
39102
return;

test/legacy_test/test_inplace.py

Lines changed: 107 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2603,11 +2603,115 @@ class TestSet_API_ZeroSize(unittest.TestCase):
26032603
def setUp(self):
26042604
self.places = get_places()
26052605

2606-
def test_set_api(self):
2606+
def test_zero_size_source_with_nonzero_shape(self):
    """A 0-size source combined with explicit non-zero dims/stride must
    yield an output with the requested shape (matching PyTorch), and the
    backing storage must be grown so no out-of-bounds read occurs."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            empty_src = paddle.randn([0, 3])
            target = paddle.randn([20])
            result = target.set_(empty_src, [20], [2])
            self.assertEqual(list(result.shape), [20])
            # Materializing the strided view must not touch memory
            # outside the allocation.
            dense = result.contiguous()
            self.assertEqual(list(dense.shape), [20])
2620+
def test_zero_size_source_default_args(self):
    """set_ with a 0-size source and no explicit shape/stride: the
    output adopts the source's 0-size shape and stays the same object
    as x (in-place semantics)."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            source = paddle.randn([0, 5])
            x = paddle.randn([10])
            out = x.set_(source)
            self.assertEqual(out.numel().item(), 0)
            self.assertEqual(list(out.shape), [0, 5])
            # assertIs checks the same identity as
            # assertTrue(id(x) == id(out)) but reports a clearer
            # failure message.
            self.assertIs(out, x)
2631+
def test_zero_size_x_nonzero_source(self):
    """A 0-size target set_ from a non-empty source behaves like a
    regular set_: the shape comes from the source and the storage is
    shared between the two tensors."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            src = paddle.to_tensor([1.0, 2.0, 3.0])
            empty_target = paddle.randn([0])
            result = empty_target.set_(src)
            self.assertEqual(list(result.shape), [3])
            # The in-place target must now alias the source buffer.
            self.assertTrue(empty_target._is_shared_buffer_with(src))
def test_both_zero_size(self):
    """set_ where both x and source are 0-size: the output is empty and
    remains the same Python object as x."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            source = paddle.randn([0])
            x = paddle.randn([0])
            out = x.set_(source)
            self.assertEqual(out.numel().item(), 0)
            # assertIs: identity check with a better failure message
            # than assertTrue(id(x) == id(out)).
            self.assertIs(out, x)
def test_both_zero_size_with_nonzero_shape(self):
    """Both x and source are 0-size but a non-zero dims/stride is
    requested. Covers the kernel branch that allocates fresh
    zero-filled storage when both tensors are empty."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            source = paddle.randn([0])
            x = paddle.randn([0])
            out = x.set_(source, [4], [1])
            self.assertEqual(list(out.shape), [4])
            # assertIs replaces assertTrue(id(x) == id(out)): same
            # identity check, clearer diagnostics.
            self.assertIs(out, x)
            # The freshly allocated storage must be readable and
            # zero-filled.
            c = out.contiguous()
            self.assertEqual(list(c.shape), [4])
            np.testing.assert_array_equal(
                c.numpy(), np.zeros([4], dtype='float32')
            )
def test_both_zero_size_with_nonzero_shape_and_offset(self):
    """Both x and source are 0-size and the user requests a non-zero
    shape with a non-zero offset: the allocation must be large enough
    to cover the offset without out-of-bounds access."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            source = paddle.randn([0])
            x = paddle.randn([0])
            # The offset must be a multiple of the element size
            # (4 bytes for float32) to avoid misaligned GPU memory
            # access.
            out = x.set_(source, [3], [2], 4)
            self.assertEqual(list(out.shape), [3])
            # assertIs replaces assertTrue(id(x) == id(out)): same
            # identity check, clearer diagnostics.
            self.assertIs(out, x)
            c = out.contiguous()
            self.assertEqual(list(c.shape), [3])
            np.testing.assert_array_equal(
                c.numpy(), np.zeros([3], dtype='float32')
            )
def test_both_zero_size_with_nonzero_2d_shape(self):
    """Both tensors are 0-size and the user requests a 2-D non-zero
    shape: verifies a multi-dimensional strided view gets correctly
    sized zero-filled storage."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            source = paddle.randn([0, 0])
            x = paddle.randn([0])
            out = x.set_(source, [2, 3], [3, 1])
            self.assertEqual(list(out.shape), [2, 3])
            # assertIs replaces assertTrue(id(x) == id(out)): same
            # identity check, clearer diagnostics.
            self.assertIs(out, x)
            c = out.contiguous()
            self.assertEqual(list(c.shape), [2, 3])
            np.testing.assert_array_equal(
                c.numpy(), np.zeros([2, 3], dtype='float32')
            )
def test_zero_size_source_no_crash_on_contiguous(self):
    """contiguous() on a tensor produced by set_ with a 0-size source
    and a user-specified shape must succeed and keep the right shape."""
    for place in self.places:
        with paddle.base.dygraph.guard(place):
            empty_src = paddle.randn([0, 3])
            target = paddle.randn([20])
            strided = target.set_(empty_src, [20], [2])
            # Converting the strided view to a dense tensor must not
            # crash or read beyond the allocation.
            dense = strided.contiguous()
            self.assertEqual(list(dense.shape), [20])

26122716

26132717
if __name__ == '__main__':

0 commit comments

Comments
 (0)