PaddlePaddle
diff --git a/‎.github/workflows/H-Coverage.yml‎
Lines changed: 7 additions & 1 deletion b/‎.github/workflows/H-Coverage.yml‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎paddle/phi/api/include/compat/ATen/ops/resize.h‎
Lines changed: 33 additions & 21 deletions b/‎paddle/phi/api/include/compat/ATen/ops/resize.h‎
Lines changed: 33 additions & 21 deletions
diff --git a/‎paddle/phi/api/include/compat/ATen/ops/slice.h‎
Lines changed: 3 additions & 0 deletions b/‎paddle/phi/api/include/compat/ATen/ops/slice.h‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/gpu/adam_kernel.cu‎
Lines changed: 4 additions & 4 deletions b/‎paddle/phi/kernels/gpu/adam_kernel.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎paddle/phi/kernels/gpu/adamw_kernel.cu‎
Lines changed: 3 additions & 4 deletions b/‎paddle/phi/kernels/gpu/adamw_kernel.cu‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎paddle/phi/kernels/gpu/add_n_kernel.cu‎
Lines changed: 6 additions & 6 deletions b/‎paddle/phi/kernels/gpu/add_n_kernel.cu‎
Lines changed: 6 additions & 6 deletions
diff --git a/‎paddle/phi/kernels/gpu/arg_min_max_kernel.cu‎
Lines changed: 1 addition & 1 deletion b/‎paddle/phi/kernels/gpu/arg_min_max_kernel.cu‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu‎
Lines changed: 4 additions & 4 deletions b/‎paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎paddle/phi/kernels/gpu/c_embedding_kernel.cu‎
Lines changed: 2 additions & 2 deletions b/‎paddle/phi/kernels/gpu/c_embedding_kernel.cu‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_grad_kernel.cu‎
Lines changed: 2 additions & 2 deletions b/‎paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_grad_kernel.cu‎
Lines changed: 2 additions & 2 deletions
@@ -428,6 +428,7 @@ jobs:
           rm -rf * .[^.]*
           set -x
           source ${{ github.workspace }}/../../../proxy
+          python -m pip install --upgrade pip wheel setuptools
           git clone https://github.com/PaddlePaddle/PaddleFleet.git .
           git config --global --add safe.directory /paddle
           git config user.name "PaddleCI"
@@ -450,7 +451,12 @@ jobs:
           export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH
           export IS_NVIDIA=True
           export PADDLEFLEET_VERSION=0.0.0
-          uv build --wheel -v
+          echo "Downloading Paddle.tar.gz from cfs"
+          wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
+          pip uninstall paddlepaddle-gpu -y
+          pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+          pip install paddle-nvidia-nvshmem-cu12 --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu129/
+          uv build --wheel --no-build-isolation -v
           '
 
       - name: upload whl to BOS
 
@@ -24,6 +24,7 @@
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/memory/malloc.h"
 
 namespace at {
 
@@ -49,11 +50,24 @@ inline int64_t ResizeCheckedNumel(at::IntArrayRef size) {
   return numel;
 }
 
+inline size_t ResizeCheckedStorageBytes(int64_t numel,  // Returns storage_offset_bytes + numel * itemsize after checking that the sum fits in size_t.
+                                        size_t itemsize,  // Multiplier applied to numel; a value of 0 short-circuits the overflow check below.
+                                        size_t storage_offset_bytes) {  // Added unscaled to the scaled element count in the result.
+  const auto numel_size = static_cast<size_t>(numel);  // NOTE(review): a negative numel would wrap to a huge value here; presumably validated before this call - confirm.
+  TORCH_CHECK(
+      itemsize == 0 || numel_size <= (std::numeric_limits<size_t>::max() -  // Division form of "offset + numel * itemsize <= max", so the product itself is never computed unchecked.
+                                      storage_offset_bytes) /
+                                         itemsize,
+      "resize_ size is too large in bytes");
+  return storage_offset_bytes + numel_size * itemsize;  // Does not wrap when the condition checked above holds.
+}
+
 }  // namespace detail
 
 // resize_ - operate on the underlying DenseTensor directly so we preserve
-// storage semantics across shrink/grow round-trips and only reallocate when
-// the requested shape exceeds the current storage capacity.
+// storage semantics across shrink/grow round-trips. When growth exceeds the
+// current capacity, expand the shared storage itself so aliasing views keep
+// their storage offset and existing storage contents stay intact.
 inline const at::Tensor& Tensor::resize_(
     at::IntArrayRef size,
     ::std::optional<at::MemoryFormat> memory_format) const {
@@ -72,35 +86,33 @@ inline const at::Tensor& Tensor::resize_(
               "resize_ is not allowed on an undefined tensor");
 
   const size_t itemsize = phi::SizeOf(dense_tensor->dtype());
-  const size_t old_numel = static_cast<size_t>(tensor_.numel());
-  const size_t new_numel_size = static_cast<size_t>(new_numel);
-  const size_t required_bytes = new_numel_size * itemsize;
-  const size_t available_bytes =
-      dense_tensor->Holder() == nullptr
-          ? 0
-          : dense_tensor->Holder()->size() - dense_tensor->meta().offset;
+  const size_t new_storage_bytes = detail::ResizeCheckedStorageBytes(
+      new_numel, itemsize, dense_tensor->meta().offset);
+  const size_t current_storage_bytes =
+      dense_tensor->Holder() == nullptr ? 0 : dense_tensor->Holder()->size();
 
-  if (required_bytes <= available_bytes || new_numel == 0) {
+  if (new_storage_bytes <= current_storage_bytes || new_numel == 0) {
     dense_tensor->Resize(dims);
     return *this;
   }
 
+  // Sync through the compat Storage path first so the DenseTensor holder is a
+  // live StorageHolderView backed by shared StorageImpl.
+  auto storage = this->storage();
   const auto old_holder = dense_tensor->Holder();
   TORCH_CHECK(old_holder != nullptr,
               "resize_ cannot grow a tensor without allocated storage");
-  const size_t old_offset = dense_tensor->meta().offset;
-  const size_t copy_bytes = std::min(old_numel, new_numel_size) * itemsize;
   const phi::Place place = old_holder->place();
-  const void* old_data =
-      old_holder == nullptr
-          ? nullptr
-          : reinterpret_cast<const uint8_t*>(old_holder->ptr()) + old_offset;
-
-  dense_tensor->ResizeAndAllocate(phi::make_ddim(dims));
-  void* new_data = dense_tensor->data();
-  if (copy_bytes > 0 && old_data != nullptr && old_data != new_data) {
-    phi::memory_utils::Copy(place, new_data, place, old_data, copy_bytes);
+  auto new_holder = paddle::memory::AllocShared(place, new_storage_bytes);
+  TORCH_CHECK(new_holder != nullptr, "resize_ failed to allocate storage");
+  const size_t copy_bytes = std::min(old_holder->size(), new_storage_bytes);
+  if (copy_bytes > 0 && old_holder->ptr() != nullptr &&
+      old_holder->ptr() != new_holder->ptr()) {
+    phi::memory_utils::Copy(
+        place, new_holder->ptr(), place, old_holder->ptr(), copy_bytes);
   }
+  storage.set_data_ptr_noswap(std::move(new_holder));
+  dense_tensor->Resize(phi::make_ddim(dims));
   return *this;
 }
 
 
@@ -23,6 +23,9 @@ inline at::Tensor slice(const at::Tensor& self,
                         ::std::optional<int64_t> start = ::std::nullopt,
                         ::std::optional<int64_t> end = ::std::nullopt,
                         int64_t step = 1) {
+  // Materialize the compat StorageHolderView before creating the slice so the
+  // base tensor and its views observe the same shared storage during resize_.
+  (void)self.storage();
   return paddle::experimental::slice(
       self._PD_GetInner(),
       {dim},
 
@@ -262,7 +262,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
 
   if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
     // Compute with betapow in REG
-    if (grad_type == phi::DataType::FLOAT32) {
+    if (grad_type == DataType::FLOAT32) {
       AdamKernelREG<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
           beta1_,
           beta2_,
@@ -313,7 +313,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
           beta2_ * beta2_pow.data<MT>()[0];
     }
   } else {
-    if (grad_type == phi::DataType::FLOAT32) {
+    if (grad_type == DataType::FLOAT32) {
       AdamKernelMEM<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
           beta1_,
           beta2_,
@@ -424,7 +424,7 @@ void MergedAdamKernel(
     if (beta1_pow[idx]->place() == CPUPlace() &&
         beta2_pow[idx]->place() == CPUPlace()) {
       // Compute with betapow in REG
-      if (grad_type == phi::DataType::FLOAT32) {
+      if (grad_type == DataType::FLOAT32) {
         AdamKernelREG<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
             beta1_,
             beta2_,
@@ -475,7 +475,7 @@ void MergedAdamKernel(
             beta2_ * beta2_pow[idx]->data<MT>()[0];
       }
     } else {
-      if (grad_type == phi::DataType::FLOAT32) {
+      if (grad_type == DataType::FLOAT32) {
         AdamKernelMEM<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
             beta1_,
             beta2_,
 
@@ -250,11 +250,10 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx,
       beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace();
 
   // Determine gradient type
-  const bool use_bfloat32_grad = grad.dtype() == phi::DataType::FLOAT32;
+  const bool use_bfloat32_grad = grad.dtype() == DataType::FLOAT32;
   // Determine moment type
-  const bool use_bfloat16_moments =
-      moment1.dtype() == phi::DataType::BFLOAT16 &&
-      moment2.dtype() == phi::DataType::BFLOAT16;
+  const bool use_bfloat16_moments = moment1.dtype() == DataType::BFLOAT16 &&
+                                    moment2.dtype() == DataType::BFLOAT16;
 
 #define LAUNCH_ADAMW_KERNEL(MOMENT_T)                                     \
   if (beta_pow_on_cpu) {                                                  \
 
@@ -157,8 +157,8 @@ void AddNKernel(const Context &dev_ctx,
   // 1. all inputs are DensorTensor and number >= 2
   // 2. the first tensor is fp32 type and the others are fp16/bf16 type
   if (in_num >= 2 && DenseTensor::classof(x[0]) &&
-      x[0]->dtype() == phi::DataType::FLOAT32 &&
-      x[1]->dtype() != phi::DataType::FLOAT32) {
+      x[0]->dtype() == DataType::FLOAT32 &&
+      x[1]->dtype() != DataType::FLOAT32) {
     auto in_other_dtype = x[1]->dtype();
     int64_t numel = static_cast<const DenseTensor *>(x[0])->numel();
     bool all_dense_tensor = true;
@@ -184,8 +184,8 @@ void AddNKernel(const Context &dev_ctx,
       }
     }
 
-    if (all_dense_tensor && (in_other_dtype == phi::DataType::BFLOAT16 ||
-                             in_other_dtype == phi::DataType::FLOAT16)) {
+    if (all_dense_tensor && (in_other_dtype == DataType::BFLOAT16 ||
+                             in_other_dtype == DataType::FLOAT16)) {
       auto tmp_in_array = phi::memory_utils::Alloc(
           dev_ctx.GetPlace(), in_data.size() * sizeof(void *));
       size_t nbytes_in = in_data.size() * sizeof(void *);
@@ -203,15 +203,15 @@ void AddNKernel(const Context &dev_ctx,
       void **in_array_data = reinterpret_cast<void **>(tmp_in_array->ptr());
       ComputeKernelParameter(numel);
       VLOG(4) << "Call SumArrayMixedTypeCUDAKernel";
-      if (in_other_dtype == phi::DataType::FLOAT16) {
+      if (in_other_dtype == DataType::FLOAT16) {
         SumArrayMixedTypeCUDAKernel<T, phi::float16>
             <<<grids, blocks, 0, stream>>>(in_0,
                                            in_array_data,
                                            out->data<T>(),
                                            numel,
                                            in_data.size(),
                                            in_place);
-      } else if (in_other_dtype == phi::DataType::BFLOAT16) {
+      } else if (in_other_dtype == DataType::BFLOAT16) {
         SumArrayMixedTypeCUDAKernel<T, phi::bfloat16>
             <<<grids, blocks, 0, stream>>>(in_0,
                                            in_array_data,
 
@@ -227,7 +227,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
           "argmin/argmax input numel must > 0, bug got %d", x.numel()));
   if (dtype == DataType::UNDEFINED) {
     phi::VisitDataTypeTiny(
-        phi::DataType::INT64,
+        DataType::INT64,
         VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
             dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
     return;
 
@@ -79,7 +79,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
 
   const auto& index_type = ids.dtype();
   if (FLAGS_embedding_deterministic == 1) {
-    if (index_type == phi::DataType::INT32) {
+    if (index_type == DataType::INT32) {
       funcs::LaunchEmbeddingGradDeterministicKernel<T, int32_t>(
           dev_ctx,
           ids.data<int32_t>(),
@@ -90,7 +90,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
           K,
           start_index);
       return;
-    } else if (index_type == phi::DataType::INT64) {
+    } else if (index_type == DataType::INT64) {
       funcs::LaunchEmbeddingGradDeterministicKernel<T, int64_t>(
           dev_ctx,
           ids.data<int64_t>(),
@@ -108,7 +108,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
       blocks = 1;
     }
     const int64_t end_idx = start_index + N;
-    if (index_type == phi::DataType::INT32) {
+    if (index_type == DataType::INT32) {
       CEmbeddingGrad<T, int32_t>
           <<<blocks, threads, 0, dev_ctx.stream()>>>(d_table,
                                                      d_output,
@@ -120,7 +120,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
                                                      end_idx,
                                                      limit);
       return;
-    } else if (index_type == phi::DataType::INT64) {
+    } else if (index_type == DataType::INT64) {
       CEmbeddingGrad<T, int64_t>
           <<<blocks, threads, 0, dev_ctx.stream()>>>(d_table,
                                                      d_output,
 
@@ -81,7 +81,7 @@ void CEmbeddingKernel(const Context& dev_ctx,
   int threads = kNumCUDAThreads;
 
   const auto& index_type = ids.dtype();
-  if (index_type == phi::DataType::INT32) {
+  if (index_type == DataType::INT32) {
     CEmbedding<T, int32_t>
         <<<blocks, threads, 0, dev_ctx.stream()>>>(output,
                                                    table,
@@ -94,7 +94,7 @@ void CEmbeddingKernel(const Context& dev_ctx,
                                                    limit,
                                                    vocab_size);
 
-  } else if (index_type == phi::DataType::INT64) {
+  } else if (index_type == DataType::INT64) {
     CEmbedding<T, int64_t>
         <<<blocks, threads, 0, dev_ctx.stream()>>>(output,
                                                    table,
 
@@ -140,7 +140,7 @@ void CSoftmaxWithCrossEntropyGradKernel(const Context& dev_ctx,
   const int64_t start_index = rank * D;
   const int64_t end_index = start_index + D;
 
-  if (label_type == phi::DataType::INT32) {
+  if (label_type == DataType::INT32) {
     if (C > 1) {
       DenseTensor is_ignore;
       is_ignore.Resize({N, 1});
@@ -178,7 +178,7 @@ void CSoftmaxWithCrossEntropyGradKernel(const Context& dev_ctx,
                                                      D,
                                                      ignore_index);
     }
-  } else if (label_type == phi::DataType::INT64) {
+  } else if (label_type == DataType::INT64) {
     if (C > 1) {
       DenseTensor is_ignore;
       is_ignore.Resize({N, 1});