Skip to content

Commit 3e990ee

Browse files
committed
Merge branch 'develop' into n302
2 parents 3635913 + e78b5d8 commit 3e990ee

File tree

114 files changed

+535
-497
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

114 files changed

+535
-497
lines changed

.github/workflows/H-Coverage.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ jobs:
428428
rm -rf * .[^.]*
429429
set -x
430430
source ${{ github.workspace }}/../../../proxy
431+
python -m pip install --upgrade pip wheel setuptools
431432
git clone https://github.com/PaddlePaddle/PaddleFleet.git .
432433
git config --global --add safe.directory /paddle
433434
git config user.name "PaddleCI"
@@ -450,7 +451,12 @@ jobs:
450451
export LD_LIBRARY_PATH=/usr/local/cuda-12.9/compat:$LD_LIBRARY_PATH
451452
export IS_NVIDIA=True
452453
export PADDLEFLEET_VERSION=0.0.0
453-
uv build --wheel -v
454+
echo "Downloading Paddle.tar.gz from cfs"
455+
wget -q --tries=5 --no-proxy https://paddle-github-action.bj.bcebos.com/PR/h-coverage/${PR_ID}/${COMMIT_ID}/paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --no-check-certificate
456+
pip uninstall paddlepaddle-gpu -y
457+
pip install paddlepaddle_gpu-0.0.0-cp310-cp310-linux_x86_64.whl --force-reinstall --extra-index-url=https://www.paddlepaddle.org.cn/packages/nightly/cu129/
458+
pip install paddle-nvidia-nvshmem-cu12 --extra-index-url https://www.paddlepaddle.org.cn/packages/nightly/cu129/
459+
uv build --wheel --no-build-isolation -v
454460
'
455461
456462
- name: upload whl to BOS

paddle/phi/api/include/compat/ATen/ops/resize.h

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include "paddle/phi/api/include/api.h"
2525
#include "paddle/phi/common/memory_utils.h"
2626
#include "paddle/phi/core/ddim.h"
27+
#include "paddle/phi/core/memory/malloc.h"
2728

2829
namespace at {
2930

@@ -49,11 +50,24 @@ inline int64_t ResizeCheckedNumel(at::IntArrayRef size) {
4950
return numel;
5051
}
5152

53+
inline size_t ResizeCheckedStorageBytes(int64_t numel,
54+
size_t itemsize,
55+
size_t storage_offset_bytes) {
56+
const auto numel_size = static_cast<size_t>(numel);
57+
TORCH_CHECK(
58+
itemsize == 0 || numel_size <= (std::numeric_limits<size_t>::max() -
59+
storage_offset_bytes) /
60+
itemsize,
61+
"resize_ size is too large in bytes");
62+
return storage_offset_bytes + numel_size * itemsize;
63+
}
64+
5265
} // namespace detail
5366

5467
// resize_ - operate on the underlying DenseTensor directly so we preserve
55-
// storage semantics across shrink/grow round-trips and only reallocate when
56-
// the requested shape exceeds the current storage capacity.
68+
// storage semantics across shrink/grow round-trips. When growth exceeds the
69+
// current capacity, expand the shared storage itself so aliasing views keep
70+
// their storage offset and existing storage contents stay intact.
5771
inline const at::Tensor& Tensor::resize_(
5872
at::IntArrayRef size,
5973
::std::optional<at::MemoryFormat> memory_format) const {
@@ -72,35 +86,33 @@ inline const at::Tensor& Tensor::resize_(
7286
"resize_ is not allowed on an undefined tensor");
7387

7488
const size_t itemsize = phi::SizeOf(dense_tensor->dtype());
75-
const size_t old_numel = static_cast<size_t>(tensor_.numel());
76-
const size_t new_numel_size = static_cast<size_t>(new_numel);
77-
const size_t required_bytes = new_numel_size * itemsize;
78-
const size_t available_bytes =
79-
dense_tensor->Holder() == nullptr
80-
? 0
81-
: dense_tensor->Holder()->size() - dense_tensor->meta().offset;
89+
const size_t new_storage_bytes = detail::ResizeCheckedStorageBytes(
90+
new_numel, itemsize, dense_tensor->meta().offset);
91+
const size_t current_storage_bytes =
92+
dense_tensor->Holder() == nullptr ? 0 : dense_tensor->Holder()->size();
8293

83-
if (required_bytes <= available_bytes || new_numel == 0) {
94+
if (new_storage_bytes <= current_storage_bytes || new_numel == 0) {
8495
dense_tensor->Resize(dims);
8596
return *this;
8697
}
8798

99+
// Sync through the compat Storage path first so the DenseTensor holder is a
100+
// live StorageHolderView backed by shared StorageImpl.
101+
auto storage = this->storage();
88102
const auto old_holder = dense_tensor->Holder();
89103
TORCH_CHECK(old_holder != nullptr,
90104
"resize_ cannot grow a tensor without allocated storage");
91-
const size_t old_offset = dense_tensor->meta().offset;
92-
const size_t copy_bytes = std::min(old_numel, new_numel_size) * itemsize;
93105
const phi::Place place = old_holder->place();
94-
const void* old_data =
95-
old_holder == nullptr
96-
? nullptr
97-
: reinterpret_cast<const uint8_t*>(old_holder->ptr()) + old_offset;
98-
99-
dense_tensor->ResizeAndAllocate(phi::make_ddim(dims));
100-
void* new_data = dense_tensor->data();
101-
if (copy_bytes > 0 && old_data != nullptr && old_data != new_data) {
102-
phi::memory_utils::Copy(place, new_data, place, old_data, copy_bytes);
106+
auto new_holder = paddle::memory::AllocShared(place, new_storage_bytes);
107+
TORCH_CHECK(new_holder != nullptr, "resize_ failed to allocate storage");
108+
const size_t copy_bytes = std::min(old_holder->size(), new_storage_bytes);
109+
if (copy_bytes > 0 && old_holder->ptr() != nullptr &&
110+
old_holder->ptr() != new_holder->ptr()) {
111+
phi::memory_utils::Copy(
112+
place, new_holder->ptr(), place, old_holder->ptr(), copy_bytes);
103113
}
114+
storage.set_data_ptr_noswap(std::move(new_holder));
115+
dense_tensor->Resize(phi::make_ddim(dims));
104116
return *this;
105117
}
106118

paddle/phi/api/include/compat/ATen/ops/slice.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ inline at::Tensor slice(const at::Tensor& self,
2323
::std::optional<int64_t> start = ::std::nullopt,
2424
::std::optional<int64_t> end = ::std::nullopt,
2525
int64_t step = 1) {
26+
// Materialize the compat StorageHolderView before creating the slice so the
27+
// base tensor and its views observe the same shared storage during resize_.
28+
(void)self.storage();
2629
return paddle::experimental::slice(
2730
self._PD_GetInner(),
2831
{dim},

paddle/phi/kernels/gpu/adam_kernel.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
262262

263263
if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
264264
// Compute with betapow in REG
265-
if (grad_type == phi::DataType::FLOAT32) {
265+
if (grad_type == DataType::FLOAT32) {
266266
AdamKernelREG<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
267267
beta1_,
268268
beta2_,
@@ -313,7 +313,7 @@ PADDLE_API void AdamDenseKernel(const Context& dev_ctx,
313313
beta2_ * beta2_pow.data<MT>()[0];
314314
}
315315
} else {
316-
if (grad_type == phi::DataType::FLOAT32) {
316+
if (grad_type == DataType::FLOAT32) {
317317
AdamKernelMEM<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
318318
beta1_,
319319
beta2_,
@@ -424,7 +424,7 @@ void MergedAdamKernel(
424424
if (beta1_pow[idx]->place() == CPUPlace() &&
425425
beta2_pow[idx]->place() == CPUPlace()) {
426426
// Compute with betapow in REG
427-
if (grad_type == phi::DataType::FLOAT32) {
427+
if (grad_type == DataType::FLOAT32) {
428428
AdamKernelREG<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
429429
beta1_,
430430
beta2_,
@@ -475,7 +475,7 @@ void MergedAdamKernel(
475475
beta2_ * beta2_pow[idx]->data<MT>()[0];
476476
}
477477
} else {
478-
if (grad_type == phi::DataType::FLOAT32) {
478+
if (grad_type == DataType::FLOAT32) {
479479
AdamKernelMEM<T, float, MT><<<blocks, threads, 0, dev_ctx.stream()>>>(
480480
beta1_,
481481
beta2_,

paddle/phi/kernels/gpu/adamw_kernel.cu

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -250,11 +250,10 @@ PADDLE_API void AdamwDenseKernel(const Context& dev_ctx,
250250
beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace();
251251

252252
// Determine gradient type
253-
const bool use_bfloat32_grad = grad.dtype() == phi::DataType::FLOAT32;
253+
const bool use_bfloat32_grad = grad.dtype() == DataType::FLOAT32;
254254
// Determine moment type
255-
const bool use_bfloat16_moments =
256-
moment1.dtype() == phi::DataType::BFLOAT16 &&
257-
moment2.dtype() == phi::DataType::BFLOAT16;
255+
const bool use_bfloat16_moments = moment1.dtype() == DataType::BFLOAT16 &&
256+
moment2.dtype() == DataType::BFLOAT16;
258257

259258
#define LAUNCH_ADAMW_KERNEL(MOMENT_T) \
260259
if (beta_pow_on_cpu) { \

paddle/phi/kernels/gpu/add_n_kernel.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -157,8 +157,8 @@ void AddNKernel(const Context &dev_ctx,
157157
// 1. all inputs are DenseTensor and number >= 2
158158
// 2. the first tensor is fp32 type and the others are fp16/bf16 type
159159
if (in_num >= 2 && DenseTensor::classof(x[0]) &&
160-
x[0]->dtype() == phi::DataType::FLOAT32 &&
161-
x[1]->dtype() != phi::DataType::FLOAT32) {
160+
x[0]->dtype() == DataType::FLOAT32 &&
161+
x[1]->dtype() != DataType::FLOAT32) {
162162
auto in_other_dtype = x[1]->dtype();
163163
int64_t numel = static_cast<const DenseTensor *>(x[0])->numel();
164164
bool all_dense_tensor = true;
@@ -184,8 +184,8 @@ void AddNKernel(const Context &dev_ctx,
184184
}
185185
}
186186

187-
if (all_dense_tensor && (in_other_dtype == phi::DataType::BFLOAT16 ||
188-
in_other_dtype == phi::DataType::FLOAT16)) {
187+
if (all_dense_tensor && (in_other_dtype == DataType::BFLOAT16 ||
188+
in_other_dtype == DataType::FLOAT16)) {
189189
auto tmp_in_array = phi::memory_utils::Alloc(
190190
dev_ctx.GetPlace(), in_data.size() * sizeof(void *));
191191
size_t nbytes_in = in_data.size() * sizeof(void *);
@@ -203,15 +203,15 @@ void AddNKernel(const Context &dev_ctx,
203203
void **in_array_data = reinterpret_cast<void **>(tmp_in_array->ptr());
204204
ComputeKernelParameter(numel);
205205
VLOG(4) << "Call SumArrayMixedTypeCUDAKernel";
206-
if (in_other_dtype == phi::DataType::FLOAT16) {
206+
if (in_other_dtype == DataType::FLOAT16) {
207207
SumArrayMixedTypeCUDAKernel<T, phi::float16>
208208
<<<grids, blocks, 0, stream>>>(in_0,
209209
in_array_data,
210210
out->data<T>(),
211211
numel,
212212
in_data.size(),
213213
in_place);
214-
} else if (in_other_dtype == phi::DataType::BFLOAT16) {
214+
} else if (in_other_dtype == DataType::BFLOAT16) {
215215
SumArrayMixedTypeCUDAKernel<T, phi::bfloat16>
216216
<<<grids, blocks, 0, stream>>>(in_0,
217217
in_array_data,

paddle/phi/kernels/gpu/arg_min_max_kernel.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ void ArgMinMaxOpCUDAKernel(const Context& dev_ctx,
227227
"argmin/argmax input numel must > 0, bug got %d", x.numel()));
228228
if (dtype == DataType::UNDEFINED) {
229229
phi::VisitDataTypeTiny(
230-
phi::DataType::INT64,
230+
DataType::INT64,
231231
VisitDataCudaArgMinMaxFunctor<Context, T, Reducer>(
232232
dev_ctx, x, axis.to<int64_t>(), keepdims, flatten, out));
233233
return;

paddle/phi/kernels/gpu/c_embedding_grad_kernel.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
7979

8080
const auto& index_type = ids.dtype();
8181
if (FLAGS_embedding_deterministic == 1) {
82-
if (index_type == phi::DataType::INT32) {
82+
if (index_type == DataType::INT32) {
8383
funcs::LaunchEmbeddingGradDeterministicKernel<T, int32_t>(
8484
dev_ctx,
8585
ids.data<int32_t>(),
@@ -90,7 +90,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
9090
K,
9191
start_index);
9292
return;
93-
} else if (index_type == phi::DataType::INT64) {
93+
} else if (index_type == DataType::INT64) {
9494
funcs::LaunchEmbeddingGradDeterministicKernel<T, int64_t>(
9595
dev_ctx,
9696
ids.data<int64_t>(),
@@ -108,7 +108,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
108108
blocks = 1;
109109
}
110110
const int64_t end_idx = start_index + N;
111-
if (index_type == phi::DataType::INT32) {
111+
if (index_type == DataType::INT32) {
112112
CEmbeddingGrad<T, int32_t>
113113
<<<blocks, threads, 0, dev_ctx.stream()>>>(d_table,
114114
d_output,
@@ -120,7 +120,7 @@ void CEmbeddingGradKernel(const Context& dev_ctx,
120120
end_idx,
121121
limit);
122122
return;
123-
} else if (index_type == phi::DataType::INT64) {
123+
} else if (index_type == DataType::INT64) {
124124
CEmbeddingGrad<T, int64_t>
125125
<<<blocks, threads, 0, dev_ctx.stream()>>>(d_table,
126126
d_output,

paddle/phi/kernels/gpu/c_embedding_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ void CEmbeddingKernel(const Context& dev_ctx,
8181
int threads = kNumCUDAThreads;
8282

8383
const auto& index_type = ids.dtype();
84-
if (index_type == phi::DataType::INT32) {
84+
if (index_type == DataType::INT32) {
8585
CEmbedding<T, int32_t>
8686
<<<blocks, threads, 0, dev_ctx.stream()>>>(output,
8787
table,
@@ -94,7 +94,7 @@ void CEmbeddingKernel(const Context& dev_ctx,
9494
limit,
9595
vocab_size);
9696

97-
} else if (index_type == phi::DataType::INT64) {
97+
} else if (index_type == DataType::INT64) {
9898
CEmbedding<T, int64_t>
9999
<<<blocks, threads, 0, dev_ctx.stream()>>>(output,
100100
table,

paddle/phi/kernels/gpu/c_softmax_with_cross_entropy_grad_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ void CSoftmaxWithCrossEntropyGradKernel(const Context& dev_ctx,
140140
const int64_t start_index = rank * D;
141141
const int64_t end_index = start_index + D;
142142

143-
if (label_type == phi::DataType::INT32) {
143+
if (label_type == DataType::INT32) {
144144
if (C > 1) {
145145
DenseTensor is_ignore;
146146
is_ignore.Resize({N, 1});
@@ -178,7 +178,7 @@ void CSoftmaxWithCrossEntropyGradKernel(const Context& dev_ctx,
178178
D,
179179
ignore_index);
180180
}
181-
} else if (label_type == phi::DataType::INT64) {
181+
} else if (label_type == DataType::INT64) {
182182
if (C > 1) {
183183
DenseTensor is_ignore;
184184
is_ignore.Resize({N, 1});

0 commit comments

Comments
 (0)