Skip to content

Commit 248d176

Browse files
committed
Fix out-of-bounds access in GraphSendUERecvCUDAKernel
This fix adds boundary checks to prevent out-of-bounds memory access when src/dst indices exceed the valid node range or when broadcast offsets exceed the feature dimensions.

Root cause:
- When src_indices contain values >= num_nodes, the kernel accesses memory beyond the allocated buffer.
- When broadcast offsets exceed x_len/e_len, out-of-bounds access occurs.

Fix:
- Add a num_nodes parameter to the kernel for boundary validation.
- Check that src/dst indices are within [0, num_nodes) before access.
- Check that x_add < x_len and e_add < e_len for broadcast offsets.
1 parent 7a22b38 commit 248d176

File tree

4 files changed

+30
-9
lines changed

4 files changed

+30
-9
lines changed

paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ __global__ void GraphSendUERecvCUDAKernel(const T* x_data,
135135
int64_t x_len,
136136
int64_t e_len,
137137
int64_t out_len,
138+
int64_t num_nodes,
138139
bool use_bcast,
139140
ComputeFunctor cfunctor,
140141
ReduceFunctor rfunctor) {
@@ -147,15 +148,22 @@ __global__ void GraphSendUERecvCUDAKernel(const T* x_data,
147148
int64_t tx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
148149
int64_t stride_x = blockDim.x * static_cast<int64_t>(gridDim.x);
149150

150-
const T* x_off = x_data + src * x_len;
151-
const T* e_off = e_data + ty * e_len;
152-
T* out_off = output + dst * out_len;
153-
while (tx < out_len) {
154-
int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
155-
int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
156-
T val = cfunctor(x_off[x_add], e_off[e_add]);
157-
rfunctor(out_off + tx, val);
158-
tx += stride_x;
151+
// Add boundary check for src/dst indices to prevent out-of-bounds access
152+
// src and dst must be within valid range: src < num_nodes, dst < num_nodes
153+
if (src >= 0 && src < num_nodes && dst >= 0 && dst < num_nodes) {
154+
const T* x_off = x_data + src * x_len;
155+
const T* e_off = e_data + ty * e_len;
156+
T* out_off = output + dst * out_len;
157+
while (tx < out_len) {
158+
int64_t x_add = use_bcast ? xbcast_off[tx] : tx;
159+
int64_t e_add = use_bcast ? ebcast_off[tx] : tx;
160+
// Add boundary check to prevent out-of-bounds access for bcast offsets
161+
if (x_add < x_len && e_add < e_len) {
162+
T val = cfunctor(x_off[x_add], e_off[e_add]);
163+
rfunctor(out_off + tx, val);
164+
}
165+
tx += stride_x;
166+
}
159167
}
160168
ty += stride_y;
161169
}

paddle/phi/kernels/gpu/send_ue_recv_grad_kernel.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ void CalculateXGrad(const Context& dev_ctx,
185185
const dim3 block_(ntx, nty);
186186
funcs::MultiplyFunctor<T> mul_functor;
187187
GraphSendUERecvSumCUDAFunctor<T> sum_functor;
188+
int64_t num_nodes = x_dims[0];
188189
if (!reduce) {
189190
GraphSendUERecvCUDAKernel<T,
190191
IndexT,
@@ -202,6 +203,7 @@ void CalculateXGrad(const Context& dev_ctx,
202203
bcast_info.l_len,
203204
bcast_info.r_len,
204205
out_len,
206+
num_nodes,
205207
bcast_info.use_bcast,
206208
mul_functor,
207209
sum_functor);
@@ -225,6 +227,7 @@ void CalculateXGrad(const Context& dev_ctx,
225227
bcast_info.l_len,
226228
bcast_info.r_len,
227229
out_len,
230+
num_nodes,
228231
bcast_info.use_bcast,
229232
mul_functor,
230233
sum_functor);

paddle/phi/kernels/gpu/send_ue_recv_kernel.cu

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
9595
const dim3 grid(nbx, nby);
9696
const dim3 block(ntx, nty);
9797
int64_t input_size = x.dims()[0];
98+
int64_t num_nodes = x.dims()[0];
9899
int block_ = 1024;
99100
if (reduce_op == "SUM" || reduce_op == "MEAN") {
100101
GraphSendUERecvSumCUDAFunctor<T> sum_functor;
@@ -116,6 +117,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
116117
bcast_info.l_len,
117118
bcast_info.r_len,
118119
out_len,
120+
num_nodes,
119121
bcast_info.use_bcast,
120122
add_funtor,
121123
sum_functor);
@@ -137,6 +139,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
137139
bcast_info.l_len,
138140
bcast_info.r_len,
139141
out_len,
142+
num_nodes,
140143
bcast_info.use_bcast,
141144
mul_functor,
142145
sum_functor);
@@ -184,6 +187,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
184187
bcast_info.l_len,
185188
bcast_info.r_len,
186189
out_len,
190+
num_nodes,
187191
bcast_info.use_bcast,
188192
add_funtor,
189193
max_functor);
@@ -205,6 +209,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
205209
bcast_info.l_len,
206210
bcast_info.r_len,
207211
out_len,
212+
num_nodes,
208213
bcast_info.use_bcast,
209214
mul_functor,
210215
max_functor);
@@ -237,6 +242,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
237242
bcast_info.l_len,
238243
bcast_info.r_len,
239244
out_len,
245+
num_nodes,
240246
bcast_info.use_bcast,
241247
add_funtor,
242248
min_functor);
@@ -258,6 +264,7 @@ void GraphSendUERecvOpCUDAKernelLaunchHelper(const Context& dev_ctx,
258264
bcast_info.l_len,
259265
bcast_info.r_len,
260266
out_len,
267+
num_nodes,
261268
bcast_info.use_bcast,
262269
mul_functor,
263270
min_functor);

paddle/phi/kernels/gpu/send_uv_grad_kernel.cu

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,7 @@ void CalculateGrad(const Context& dev_ctx,
145145
funcs::MultiplyFunctor<T> mul_functor;
146146
GraphSendUERecvSumCUDAFunctor<T> sum_functor;
147147
const T* y_data = y.data<T>();
148+
int64_t num_nodes = x_grad_dims[0];
148149
if (!reduce) {
149150
GraphSendUERecvCUDAKernel<T,
150151
IndexT,
@@ -162,6 +163,7 @@ void CalculateGrad(const Context& dev_ctx,
162163
bcast_info.l_len,
163164
bcast_info.r_len,
164165
out_len,
166+
num_nodes,
165167
bcast_info.use_bcast,
166168
mul_functor,
167169
sum_functor);
@@ -189,6 +191,7 @@ void CalculateGrad(const Context& dev_ctx,
189191
bcast_info.l_len,
190192
bcast_info.r_len,
191193
out_len,
194+
num_nodes,
192195
bcast_info.use_bcast,
193196
mul_functor,
194197
sum_functor);

0 commit comments

Comments (0)