|
18 | 18 |
|
19 | 19 | #include "paddle/phi/backends/xpu/xpu_context.h" |
20 | 20 | #include "paddle/phi/core/kernel_registry.h" |
| 21 | +#ifdef PADDLE_WITH_XPU_FFT |
| 22 | +#include "fft/cuComplex.h" |
| 23 | +#include "paddle/phi/kernels/complex_kernel.h" |
| 24 | +#include "paddle/phi/kernels/expand_kernel.h" |
| 25 | +#include "paddle/phi/kernels/funcs/common_infer_shape_functions.h" |
| 26 | +namespace xfft_internal::xpu { |
| 27 | +int RemainderFunctor(int N, float2* input_x, float2* input_y, float2* output); |
| 28 | +} |
| 29 | +#endif |
21 | 30 |
|
22 | 31 | namespace phi { |
23 | 32 |
|
@@ -75,6 +84,69 @@ void ElementwisePowKernel(const Context& dev_ctx, |
75 | 84 | ElementwisePowRawKernel<T>(dev_ctx, x, y, axis, out); |
76 | 85 | } |
77 | 86 |
|
#ifdef PADDLE_WITH_XPU_FFT
// Explicit specialization of RemainderKernel for complex<float> on XPU.
// Broadcasts both operands to their common shape, then delegates the
// element-wise remainder to the XPU FFT library's RemainderFunctor.
template <>
void RemainderKernel<phi::dtype::complex<float>, XPUContext>(
    const XPUContext& dev_ctx,
    const DenseTensor& x,
    const DenseTensor& y,
    DenseTensor* out) {
  using T = phi::dtype::complex<float>;

  // Common broadcast shape of the two operands.
  const auto out_dims = phi::funcs::BroadcastTwoDims(x.dims(), y.dims());
  const std::vector<int64_t> target_shape = phi::vectorize(out_dims);

  // Expands a complex tensor to `dst`'s shape. There is no complex Expand on
  // XPU here, so the real and imaginary planes are expanded separately as
  // float tensors and recombined via ComplexKernel.
  auto expand_complex = [](const XPUContext& ctx,
                           const DenseTensor& src,
                           const std::vector<int64_t>& shape,
                           DenseTensor* dst) {
    DenseTensor re_expanded;
    DenseTensor im_expanded;
    re_expanded.Resize(dst->dims());
    im_expanded.Resize(dst->dims());
    ctx.template Alloc<float>(&re_expanded);
    ctx.template Alloc<float>(&im_expanded);
    const DenseTensor re = Real<T, XPUContext>(ctx, src);
    const DenseTensor im = Imag<T, XPUContext>(ctx, src);
    ExpandKernel<float, XPUContext>(
        ctx, re, phi::IntArray(shape), &re_expanded);
    ExpandKernel<float, XPUContext>(
        ctx, im, phi::IntArray(shape), &im_expanded);
    phi::ComplexKernel<float>(ctx, re_expanded, im_expanded, dst);
  };

  // Materialize each operand at the broadcast shape. When an operand already
  // has that shape its buffer is used directly; the const_cast is required
  // because RemainderFunctor takes non-const input pointers.
  DenseTensor x_bcast;
  DenseTensor y_bcast;
  T* x_ptr = nullptr;
  T* y_ptr = nullptr;

  if (x.dims() == out_dims) {
    x_ptr = const_cast<T*>(x.data<T>());
  } else {
    x_bcast.Resize(out_dims);
    dev_ctx.template Alloc<T>(&x_bcast);
    expand_complex(dev_ctx, x, target_shape, &x_bcast);
    x_ptr = x_bcast.data<T>();
  }

  if (y.dims() == out_dims) {
    y_ptr = const_cast<T*>(y.data<T>());
  } else {
    y_bcast.Resize(out_dims);
    dev_ctx.template Alloc<T>(&y_bcast);
    expand_complex(dev_ctx, y, target_shape, &y_bcast);
    y_ptr = y_bcast.data<T>();
  }

  dev_ctx.template Alloc<T>(out);
  // cuFloatComplex is layout-compatible with phi::dtype::complex<float>
  // (two packed floats), matching the float2-based functor declaration.
  const int r = xfft_internal::xpu::RemainderFunctor(
      out->numel(),
      reinterpret_cast<cuFloatComplex*>(x_ptr),
      reinterpret_cast<cuFloatComplex*>(y_ptr),
      reinterpret_cast<cuFloatComplex*>(out->data<T>()));
  PADDLE_ENFORCE_XPU_SUCCESS(r);
}
#endif
| 149 | + |
78 | 150 | } // namespace phi |
79 | 151 |
|
80 | 152 | PD_REGISTER_KERNEL(floor_divide, |
@@ -110,8 +182,12 @@ PD_REGISTER_KERNEL(remainder, |
110 | 182 | phi::RemainderKernel, |
111 | 183 | float, |
112 | 184 | phi::dtype::float16, |
| 185 | +#ifdef PADDLE_WITH_XPU_FFT |
| 186 | + phi::dtype::complex<float>, |
| 187 | +#endif |
113 | 188 | int32_t, |
114 | | - int64_t) {} |
| 189 | + int64_t) { |
| 190 | +} |
115 | 191 | PD_REGISTER_KERNEL(elementwise_pow, |
116 | 192 | XPU, |
117 | 193 | ALL_LAYOUT, |
|
0 commit comments