Skip to content

Commit 0de7982

Browse files
committed
Merge branch 'develop' into n277
2 parents 4cf4736 + 778244e commit 0de7982

19 files changed

+176
-119
lines changed

paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -435,14 +435,14 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
435435
DenseTensor transformed_ddy(ddY->type());
436436
if (data_layout == DataLayout::NCHW && x_dims.size() > 2) {
437437
VLOG(3) << "Transform batchnorm output from NCHW to NHWC";
438-
// Input Tensor
438+
// Input DenseTensor
439439
ResizeToChannelLast<Context, T>(dev_ctx, X, &transformed_x);
440440
TransToChannelLast<Context, T>(dev_ctx, X, &transformed_x);
441441
ResizeToChannelLast<Context, T>(dev_ctx, dY, &transformed_dy);
442442
TransToChannelLast<Context, T>(dev_ctx, dY, &transformed_dy);
443443
ResizeToChannelLast<Context, T>(dev_ctx, ddX, &transformed_ddx);
444444
TransToChannelLast<Context, T>(dev_ctx, ddX, &transformed_ddx);
445-
// Output Tensor
445+
// Output DenseTensor
446446
ResizeToChannelLast<Context, T>(dev_ctx, dX, &transformed_dx);
447447
ResizeToChannelLast<Context, T>(dev_ctx, ddY, &transformed_ddy);
448448
} else {
@@ -458,7 +458,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
458458
ConstEigenVectorArrayMap<T> mean_arr(mean_data, C);
459459
ConstEigenVectorArrayMap<T> inv_var_arr(inv_var_data, C);
460460

461-
Tensor mean_tile;
461+
DenseTensor mean_tile;
462462
mean_tile.Resize({C, sample_size});
463463
EigenArrayMap<T> mean_tile_data(
464464
dev_ctx.template Alloc<T>(&mean_tile), C, sample_size);
@@ -480,7 +480,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
480480
ConstEigenVectorArrayMap<T> scale_arr(
481481
Scale ? Scale->data<T>() : Scale_data.data<T>(), C);
482482

483-
Tensor scale_tile;
483+
DenseTensor scale_tile;
484484
scale_tile.Resize({C, sample_size});
485485
EigenArrayMap<T> scale_tile_data(
486486
dev_ctx.template Alloc<T>(&scale_tile), C, sample_size);
@@ -505,7 +505,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
505505
// math: dx = (ddscale * dy) * inv_var
506506
if (ddScale) {
507507
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
508-
Tensor ddscale_tile;
508+
DenseTensor ddscale_tile;
509509
ddscale_tile.Resize({C, sample_size});
510510
EigenArrayMap<T> ddscale_tile_data(
511511
dev_ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
@@ -557,7 +557,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
557557
}
558558
if (ddScale) {
559559
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
560-
Tensor ddscale_tile;
560+
DenseTensor ddscale_tile;
561561
ddscale_tile.Resize({C, sample_size});
562562
EigenArrayMap<T> ddscale_tile_data(
563563
dev_ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
@@ -594,7 +594,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
594594
// inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) *
595595
// ddx
596596
if (ddX) {
597-
Tensor first_grad;
597+
DenseTensor first_grad;
598598
first_grad.Resize({C, sample_size});
599599
EigenArrayMap<T> first_grad_arr(
600600
dev_ctx.template Alloc<T>(&first_grad), C, sample_size);
@@ -645,7 +645,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
645645
}
646646
if (ddScale) {
647647
ConstEigenVectorArrayMap<T> ddscale_arr(ddScale->data<T>(), C);
648-
Tensor ddscale_tile;
648+
DenseTensor ddscale_tile;
649649
ddscale_tile.Resize({C, sample_size});
650650
EigenArrayMap<T> ddscale_tile_data(
651651
dev_ctx.template Alloc<T>(&ddscale_tile), C, sample_size);
@@ -656,7 +656,7 @@ void BatchNormDoubleGradKernel(const Context& dev_ctx,
656656

657657
if (ddBias) {
658658
ConstEigenVectorArrayMap<T> ddbias_arr(ddBias->data<T>(), C);
659-
Tensor ddbias_tile;
659+
DenseTensor ddbias_tile;
660660
ddbias_tile.Resize({C, sample_size});
661661
EigenArrayMap<T> ddbias_tile_data(
662662
dev_ctx.template Alloc<T>(&ddbias_tile), C, sample_size);

paddle/phi/kernels/funcs/batch_norm_utils.h

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,12 +20,10 @@
2020

2121
namespace phi {
2222

23-
using Tensor = DenseTensor;
24-
2523
template <typename DeviceContext, typename T>
2624
inline void ResizeToChannelFirst(const DeviceContext& dev_ctx,
27-
const Tensor* input,
28-
Tensor* transformed_input) {
25+
const DenseTensor* input,
26+
DenseTensor* transformed_input) {
2927
int dim = input->dims().size() - 2;
3028
if (dim == 3) {
3129
// input
@@ -61,8 +59,8 @@ inline void ResizeToChannelFirst(const DeviceContext& dev_ctx,
6159

6260
template <typename DeviceContext, typename T>
6361
inline void ResizeToChannelLast(const DeviceContext& dev_ctx,
64-
const Tensor* input,
65-
Tensor* transformed_input) {
62+
const DenseTensor* input,
63+
DenseTensor* transformed_input) {
6664
int dim = input->dims().size() - 2;
6765
if (dim == 3) {
6866
// input
@@ -99,8 +97,8 @@ inline void ResizeToChannelLast(const DeviceContext& dev_ctx,
9997

10098
template <typename DeviceContext, typename T>
10199
inline void TransToChannelFirst(const DeviceContext& dev_ctx,
102-
const Tensor* input,
103-
Tensor* transformed_input) {
100+
const DenseTensor* input,
101+
DenseTensor* transformed_input) {
104102
VLOG(5) << "Why am I called?";
105103
int dim = input->dims().size() - 2;
106104
if (dim == 3) {
@@ -121,8 +119,8 @@ inline void TransToChannelFirst(const DeviceContext& dev_ctx,
121119

122120
template <typename DeviceContext, typename T>
123121
inline void TransToChannelLast(const DeviceContext& dev_ctx,
124-
const Tensor* input,
125-
Tensor* transformed_input) {
122+
const DenseTensor* input,
123+
DenseTensor* transformed_input) {
126124
int dim = input->dims().size() - 2;
127125
if (dim == 3) {
128126
std::vector<int> axis{0, 2, 3, 4, 1};

paddle/phi/kernels/funcs/cross_entropy.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@ limitations under the License. */
2020
namespace phi {
2121
namespace funcs {
2222

23-
using Tensor = DenseTensor;
2423
template <typename T,
2524
int MajorType = Eigen::RowMajor,
2625
typename IndexType = Eigen::DenseIndex>

paddle/phi/kernels/funcs/segment_pooling.cc

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@ limitations under the License. */
2121

2222
namespace phi::funcs {
2323

24-
using Tensor = DenseTensor;
25-
2624
template <typename T, typename IndexT>
2725
class SegmentPoolFunctor<phi::CPUContext, T, IndexT> {
2826
public:
@@ -51,8 +49,8 @@ class SegmentPoolFunctor<phi::CPUContext, T, IndexT> {
5149
segment_ids[idx]));
5250
}
5351

54-
Tensor out_t = output->Slice(current_id, current_id + 1);
55-
Tensor in_t = input.Slice(last_idx, idx);
52+
DenseTensor out_t = output->Slice(current_id, current_id + 1);
53+
DenseTensor in_t = input.Slice(last_idx, idx);
5654

5755
int64_t h = idx - last_idx;
5856
auto in_e = EigenMatrix<T>::From(in_t, make_ddim({h, w}));
@@ -110,8 +108,8 @@ class SegmentPoolGradFunctor<phi::CPUContext, T, IndexT> {
110108
segment_ids[idx]));
111109
}
112110

113-
Tensor out_g_t = out_grad.Slice(current_id, current_id + 1);
114-
Tensor in_g_t = in_grad->Slice(last_idx, idx);
111+
DenseTensor out_g_t = out_grad.Slice(current_id, current_id + 1);
112+
DenseTensor in_g_t = in_grad->Slice(last_idx, idx);
115113

116114
int64_t h = idx - last_idx;
117115
auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
@@ -123,8 +121,8 @@ class SegmentPoolGradFunctor<phi::CPUContext, T, IndexT> {
123121
} else if (pooltype == "SUM") {
124122
in_g_e.device(place) = out_g_e.broadcast(bcast);
125123
} else if (pooltype == "MAX" || pooltype == "MIN") {
126-
Tensor out_t = output.Slice(current_id, current_id + 1);
127-
Tensor in_t = input.Slice(last_idx, idx);
124+
DenseTensor out_t = output.Slice(current_id, current_id + 1);
125+
DenseTensor in_t = input.Slice(last_idx, idx);
128126
auto in_e = EigenMatrix<T>::From(in_t, {h, w});
129127
auto out_e = EigenMatrix<T>::From(out_t, {1, w});
130128
in_g_e.device(place) =

paddle/phi/kernels/funcs/segment_pooling.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,6 @@ limitations under the License. */
2525
namespace phi {
2626
namespace funcs {
2727

28-
using Tensor = DenseTensor;
29-
3028
template <typename T, typename Index, int DimTileSize>
3129
__global__ void SegmentSumIdsKernel(const Index* segment_ids,
3230
T* summed_ids,

paddle/phi/kernels/funcs/top_k_function_cuda.h

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ struct NumericTraits<phi::bfloat16>
8484
namespace phi {
8585
namespace funcs {
8686

87-
using Tensor = DenseTensor;
88-
8987
inline void GetDims(
9088
const DDim& dim, int axis, int64_t* pre, int64_t* n, int64_t* post) {
9189
*pre = 1;
@@ -1089,7 +1087,7 @@ bool SortTopk(const GPUContext& dev_ctx,
10891087
bool largest = true) {
10901088
auto cu_stream = dev_ctx.stream();
10911089

1092-
Tensor input_indices;
1090+
DenseTensor input_indices;
10931091
const std::vector<int64_t> dims = {num_rows, num_cols};
10941092
auto dim = make_ddim(dims);
10951093
input_indices.Resize(dim);
@@ -1130,8 +1128,8 @@ bool SortTopk(const GPUContext& dev_ctx,
11301128
T* sorted_values_ptr;
11311129
int64_t* sorted_indices_ptr;
11321130

1133-
Tensor temp_values;
1134-
Tensor temp_indices;
1131+
DenseTensor temp_values;
1132+
DenseTensor temp_indices;
11351133

11361134
const T* input = input_tensor->data<T>();
11371135
T* values = out_tensor->data<T>();
@@ -1217,7 +1215,7 @@ bool SortTopk(const GPUContext& dev_ctx,
12171215
}
12181216
#endif
12191217
}
1220-
Tensor temp_storage;
1218+
DenseTensor temp_storage;
12211219
dev_ctx.template Alloc<uint8_t>(&temp_storage, temp_storage_bytes);
12221220

12231221
if (largest) {
@@ -1299,14 +1297,14 @@ bool SortTopk(const GPUContext& dev_ctx,
12991297
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_indices{0, 0};
13001298
const Eigen::DSizes<Eigen::DenseIndex, 2> slice_sizes{num_rows, k};
13011299
auto e_indices = EigenMatrix<int64_t>::From(*indices_tensor, dim);
1302-
auto e_tmp_indices =
1303-
EigenMatrix<int64_t>::From(static_cast<const Tensor>(temp_indices));
1300+
auto e_tmp_indices = EigenMatrix<int64_t>::From(
1301+
static_cast<const DenseTensor>(temp_indices));
13041302

13051303
std::vector<int> odims = {static_cast<int>(num_rows), static_cast<int>(k)};
13061304
auto dim = make_ddim(odims);
13071305
auto e_values = EigenMatrix<T>::From(*out_tensor, dim);
13081306
auto e_tmp_values =
1309-
EigenMatrix<T>::From(static_cast<const Tensor>(temp_values));
1307+
EigenMatrix<T>::From(static_cast<const DenseTensor>(temp_values));
13101308

13111309
funcs::EigenSlice<std::decay_t<decltype(dev)>, int64_t, 2>::Eval(
13121310
dev, e_indices, e_tmp_indices, slice_indices, slice_sizes);

paddle/phi/kernels/funcs/transpose_function.cu.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ limitations under the License. */
2727
namespace phi {
2828
namespace funcs {
2929

30-
using Tensor = DenseTensor;
31-
3230
struct EqualTo {
3331
constexpr bool operator()(int a, int b) const { return a == b; }
3432
};

paddle/phi/kernels/gpudnn/conv_grad_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -554,8 +554,8 @@ void ConvCudnnGradKernel(const Context& dev_ctx,
554554
// So we create a new padded input tensor.
555555
int data_dim = strides.size(); // 2d or 3d
556556
bool is_sys_pad = funcs::IsSymmetricPadding(paddings, data_dim);
557-
Tensor transformed_input(input.type());
558-
Tensor transformed_input_grad(input.type());
557+
DenseTensor transformed_input(input.type());
558+
DenseTensor transformed_input_grad(input.type());
559559
std::vector<int> padding_common(data_dim, 0);
560560
std::vector<int> input_pad(transformed_input_channel.dims().size() * 2, 0);
561561

paddle/phi/kernels/impl/conv_grad_kernel_impl.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ void ConvGradKernel(const Context& dev_ctx,
207207

208208
if (filter_grad) {
209209
dev_ctx.template Alloc<T>(filter_grad);
210-
Tensor filter_grad_ = *filter_grad;
210+
DenseTensor filter_grad_ = *filter_grad;
211211
filter_grad_.Resize(filter_matrix_shape);
212212
set_zero(dev_ctx, filter_grad, static_cast<T>(0));
213213
funcs::Im2ColFunctor<funcs::ColFormat::CFO, Context, T> im2col;
@@ -369,7 +369,7 @@ void ConvGradGradKernel(const Context& dev_ctx,
369369
// dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
370370
// oH, oW)
371371
if (dX && ddW_in) {
372-
Tensor ddW;
372+
DenseTensor ddW;
373373
ddW.ShareDataWith(*ddW_in).Resize(filter_matrix_shape);
374374
dev_ctx.template Alloc<T>(dX);
375375

@@ -436,7 +436,8 @@ void ConvGradGradKernel(const Context& dev_ctx,
436436
for (int i = 0; i < batch_size; ++i) {
437437
DenseTensor dy_batch =
438438
transformed_dY.Slice(i, i + 1).Resize(output_matrix_shape);
439-
Tensor ddx_batch = transformed_ddX.Slice(i, i + 1).Resize(input_shape);
439+
DenseTensor ddx_batch =
440+
transformed_ddX.Slice(i, i + 1).Resize(input_shape);
440441
for (int g = 0; g < groups; ++g) {
441442
// im2col
442443
DenseTensor dy_slice = dy_batch.Slice(g * out_step, (g + 1) * out_step);

paddle/phi/kernels/impl/expand_kernel_impl.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#define MAX_RANK_SUPPORTED 8
2323

2424
namespace phi {
25-
using Tensor = DenseTensor;
2625

2726
template <typename Context, typename T, int Rank>
2827
void Expand(const Context& dev_ctx,

0 commit comments

Comments (0)