
Commit 86bb9e7

wanghuancoder, lshpku, Enigmatisms, Dmovic, and ggggxm authored
Fix bigtensor 421 (#72953)
* refine forrange (#72360)
* refine forrange
* refine forrange
* reduce support big tensor (#71970)
* reduce support big tensor
* [PHI] Fix gridDim limit for reduce kernel (#72507)
* [API] isclose support bigtensor (#72516)
* isclose support bigtensor
* refine
* [API] isnan isinf isfinite support bigtensor (#72517)
* isnan isinf isfinite support bigtensor
* refine
* [PHI] Fix cum kernel for big tensor (#72562)
* [PHI] Preliminary fix for elementwise broadcast int32 shape overflow (#72584)
* [PHI] Align linalg.solve kernel with torch (#72608)
* Update strided copy kernel (#72662)
* [PHI] Fix grid sample kernel for big tensor (#72628)
* [PHI] Fix argsort big tensor bug (#72712)
* [PHI] Fixed argsort big tensor bug
* [PHI] Fixed shape mismatch problem.
* [PHI] Fix contiguous kernel for big tensor (#72705)
* [PHI] Fix flatten and split kernel for big tensor (#72634)
* [PHI] Fix out-of-bound issue of paddle.take_along_axis (#72757)
* [PHI] fix paddle.diag with big tensor (#72638)
* [API] fix paddle.cross with big tensor (#72652)
* [PHI] Fix paddle.where api for big tensor (#72717)
* [PHI] Fix bincount kernel for big tensor (#72706)
* fix bincount kernel for big tensor
* use HostAlloc to alloc memory
* add cpu test case
* [PHI] Fix full_like kernel for big tensor (#72831)
* [API] Fix int overflow and float16 support for paddle.frac (#72815)
* [PHI] Align paddle.inner with torch in matmul logic (#72843)
* [PHI] Fix paddle.var & paddle.std float16 overflow (#72650)
* [PHI] Fix logsumexp precision problem (#72681)
* [PHI] Debug for logsumexp, bug source found
* [PHI] Removed GetNumBlocks func to get correct logsumexp
* [PHI] Removed redundant debug VLOG
* [PHI] Elegant grid bounded solution
* [Accuracy diff No.55-56、76-77] Fix accuracy diff for var&std API (#72879)
* [Accuracy diff No.21] Fix accuracy diff for heaviside API (#72894)

---------

Co-authored-by: Shuhao Liang <[email protected]>
Co-authored-by: Qianyue He <[email protected]>
Co-authored-by: Lei Ding <[email protected]>
Co-authored-by: ggggxm <[email protected]>
Co-authored-by: xkkkkkk23 <[email protected]>
Co-authored-by: Zx <[email protected]>
Co-authored-by: huangjiyi <[email protected]>
Co-authored-by: ooo oo <[email protected]>
1 parent d488751 commit 86bb9e7

71 files changed: +1920 −1442 lines changed

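Most of the changes below share one root cause: once a tensor holds more than 2^31 - 1 elements, or a kernel launch needs a grid beyond the 32-bit limits, any int used for an element count or index silently overflows. A minimal standalone sketch of that failure mode, using a made-up size and no Paddle code:

#include <cstdint>
#include <iostream>

int main() {
  // A hypothetical "big tensor": 3 * 1024^3 elements (about 3.2 billion).
  const int64_t numel = 3LL * 1024 * 1024 * 1024;

  // Narrowing the count to 32 bits wraps around; 64 bits stays exact.
  const auto numel32 = static_cast<int32_t>(numel);
  std::cout << "int32 numel: " << numel32 << "\n";  // typically negative after wrap-around
  std::cout << "int64 numel: " << numel << "\n";    // 3221225472
  return 0;
}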

paddle/fluid/operators/elementwise/elementwise_op.h

Lines changed: 5 additions & 5 deletions
@@ -106,9 +106,9 @@ class ElementwiseOp : public framework::OperatorWithKernel {
                           axis));
     axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1)
                      : axis);
-    std::vector<int> x_dims_array(max_dim);
-    std::vector<int> y_dims_array(max_dim);
-    std::vector<int> out_dims_array(max_dim);
+    std::vector<int64_t> x_dims_array(max_dim);
+    std::vector<int64_t> y_dims_array(max_dim);
+    std::vector<int64_t> out_dims_array(max_dim);
 #ifdef PADDLE_WITH_DNNL
     // Broadcasting of dims has to be done on Paddle shapes (NHWC)
     // if model is using NHWC and any of shapes in at least 3D
@@ -120,8 +120,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     if (should_rotate) {
       // Pick bigger shape and rotate this one
       bool x_over_y = (x_dims.size() > y_dims.size());
-      auto vdims = x_over_y ? common::vectorize<int>(x_dims)
-                            : common::vectorize<int>(y_dims);
+      auto vdims = x_over_y ? common::vectorize<int64_t>(x_dims)
+                            : common::vectorize<int64_t>(y_dims);
       std::rotate(vdims.begin() + 1, vdims.begin() + 2, vdims.end());
       if (x_over_y) {
         x_dims = common::make_ddim(vdims);
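For context on the int to int64_t switch above, a minimal standalone sketch (not Paddle code; the shapes are made up) of why the broadcast dimension buffers need 64-bit storage: a single axis of a big tensor can already exceed what a 32-bit int can hold.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical shapes being broadcast together: [3000000000, 1] and [1, 4].
  std::vector<int64_t> x_dims{3'000'000'000LL, 1};
  std::vector<int64_t> y_dims{1, 4};

  // With int64_t buffers, the broadcast output dims stay exact.
  std::vector<int64_t> out_dims(2);
  for (size_t i = 0; i < out_dims.size(); ++i) {
    out_dims[i] = std::max(x_dims[i], y_dims[i]);
  }
  std::cout << out_dims[0] << " x " << out_dims[1] << "\n";  // 3000000000 x 4
  return 0;
}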

paddle/fluid/prim/api/composite_backward/composite_backward_api.h

Lines changed: 28 additions & 4 deletions
@@ -1308,7 +1308,13 @@ void max_grad(const Tensor& x,
     auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
     auto out_tmp = out.expand(IntArray(x_dim));
     auto mask = equal<T>(x, out_tmp);
-    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    if (axis_size == 0) {
+      auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
+      auto grad_tmp = out_grad_tmp / mask_sum;
+      x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
+    } else {
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
   } else {
     auto axis_ = std::vector<int64_t>();
     if (reduce_all) {
@@ -1329,7 +1335,13 @@ void max_grad(const Tensor& x,
     auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
     auto out_tmp = out_.expand(IntArray(x_dim));
     auto mask = equal<T>(x, out_tmp);
-    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    if (axis_size == 0) {
+      auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
+      auto grad_tmp = out_grad_tmp / mask_sum;
+      x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
+    } else {
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
   }
   set_output<T>(x_grad_tmp, x_grad);
 }
@@ -1361,7 +1373,13 @@ void min_grad(const Tensor& x,
     auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
     auto out_tmp = out.expand(IntArray(x_dim));
     auto mask = equal<T>(x, out_tmp);
-    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    if (axis_size == 0) {
+      auto mask_sum = sum<T>(mask, axis, x.dtype(), keepdim = true);
+      auto grad_tmp = out_grad_tmp / mask_sum;
+      x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
+    } else {
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
   } else {
     auto axis_ = std::vector<int64_t>();
     if (reduce_all) {
@@ -1382,7 +1400,13 @@ void min_grad(const Tensor& x,
     auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
     auto out_tmp = out_.expand(IntArray(x_dim));
     auto mask = equal<T>(x, out_tmp);
-    x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    if (axis_size == 0) {
+      auto mask_sum = sum<T>(mask, axis_, x.dtype(), keepdim = true);
+      auto grad_tmp = out_grad_tmp / mask_sum;
+      x_grad_tmp = where<T>(mask, grad_tmp, zero_tensor);
+    } else {
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
   }
   set_output<T>(x_grad_tmp, x_grad);
 }
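The new axis_size == 0 branches change how ties are handled when max/min reduces over all elements: the incoming gradient is divided by the number of elements equal to the extremum (mask_sum) instead of being copied to every tied element. A minimal plain-C++ sketch of that rule, independent of the prim API:

#include <algorithm>
#include <iostream>
#include <vector>

// Gradient of y = max(x) w.r.t. x, splitting the gradient evenly among ties.
std::vector<double> max_grad_split_ties(const std::vector<double>& x,
                                        double out_grad) {
  const double m = *std::max_element(x.begin(), x.end());
  const double ties = std::count(x.begin(), x.end(), m);  // plays the role of mask_sum
  std::vector<double> grad(x.size(), 0.0);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] == m) grad[i] = out_grad / ties;  // where(mask, out_grad / mask_sum, 0)
  }
  return grad;
}

int main() {
  for (double g : max_grad_split_ties({1.0, 3.0, 3.0}, 1.0)) {
    std::cout << g << " ";  // prints: 0 0.5 0.5
  }
  std::cout << "\n";
  return 0;
}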

paddle/fluid/primitive/decomp_rule/decomp_rule/composite.h

Lines changed: 1 addition & 1 deletion
@@ -909,7 +909,7 @@ Tensor flatten_decomp(const Tensor& x, int start_axis, int end_axis) {
     return reshape<T>(x, x_dim);
   }
 
-  int slice_numel = 1;
+  int64_t slice_numel = 1;
   for (int i = start_axis; i <= end_axis; ++i) {
     slice_numel *= x_dim[i];
   }
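The slice_numel change matters because the flattened slice size is a product of dimensions, which overflows 32 bits long before any single dimension does. A small standalone check with made-up sizes:

#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int main() {
  // Hypothetical flattened slice: 100000 x 100000 elements = 10^10 in total.
  std::vector<int64_t> x_dim{100000, 100000};

  int64_t slice_numel = 1;  // 64-bit accumulator, as in the patched code
  for (int64_t d : x_dim) slice_numel *= d;

  std::cout << "slice_numel = " << slice_numel << "\n";
  std::cout << "fits in int32? "
            << (slice_numel <= std::numeric_limits<int>::max() ? "yes" : "no")
            << "\n";  // prints "no": a 32-bit counter would have overflowed
  return 0;
}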

paddle/fluid/primitive/decomp_rule/decomp_vjp/details.h

Lines changed: 108 additions & 101 deletions
@@ -1575,107 +1575,6 @@ void pad_grad(const Tensor& input,
   }
 }
 
-template <typename T>
-void max_grad(const Tensor& x,
-              const Tensor& out,
-              const Tensor& out_grad,
-              const IntArray& axis,
-              bool keepdim,
-              bool reduce_all,
-              Tensor* x_grad) {
-  if (!x_grad) {
-    return;
-  }
-
-  Tensor x_grad_tmp;
-  if (has_dynamic_shape(x.shape())) {
-    const Tensor x_shape = shape64<T>(x);
-    const Tensor zero_tensor =
-        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
-    const int64_t axis_size = axis.size();
-    const int64_t x_dim_size = x.dims().size();
-
-    reduce_all = false;
-    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
-      reduce_all = true;
-    } else {
-      reduce_all = false;
-    }
-
-    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
-      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
-      auto out_tmp = backend::expand<T>(out, x_shape);
-      auto mask = equal<T>(x, out_tmp);
-      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
-    } else {
-      const Tensor out_grad_shape = shape64<T>(out_grad);
-      auto axis_ = std::vector<int64_t>();
-
-      if (reduce_all) {
-        for (int64_t i = 0; i < x_dim_size; i++) {
-          axis_.push_back(i);
-        }
-      } else {
-        axis_ = axis.GetData();
-        for (int64_t i = 0; i < axis_size; i++) {
-          if (axis[i] < 0) {
-            axis_[i] = axis[i] + x_dim_size;
-          }
-        }
-      }
-      const Tensor out_grad_shape_extend =
-          get_unsqueeze_dims<T>(out_grad_shape, axis_);
-      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
-      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
-      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
-      auto out_tmp = backend::expand<T>(out_, x_shape);
-      auto mask = equal<T>(x, out_tmp);
-      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
-    }
-  } else {
-    auto zero_tensor =
-        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
-    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
-    int64_t axis_size = axis.size();
-    int64_t x_dim_size = x_dim.size();
-    reduce_all = false;
-    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
-      reduce_all = true;
-    } else {
-      reduce_all = false;
-    }
-
-    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
-      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
-      auto out_tmp = out.expand(IntArray(x_dim));
-      auto mask = equal<T>(x, out_tmp);
-      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
-    } else {
-      auto axis_ = std::vector<int64_t>();
-      if (reduce_all) {
-        for (int64_t i = 0; i < x_dim_size; i++) {
-          axis_.push_back(i);
-        }
-      } else {
-        axis_ = axis.GetData();
-        for (int64_t i = 0; i < axis_size; i++) {
-          if (axis[i] < 0) {
-            axis_[i] = axis[i] + x_dim_size;
-          }
-        }
-      }
-      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
-      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
-      auto out_ = reshape<T>(out, out_grad_shape);
-      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
-      auto out_tmp = out_.expand(IntArray(x_dim));
-      auto mask = equal<T>(x, out_tmp);
-      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
-    }
-  }
-  set_output<T>(x_grad_tmp, x_grad);
-}
-
 template <typename T>
 void slice_grad(const Tensor& input,
                 const Tensor& out_grad,
@@ -3498,6 +3397,114 @@ void amin_grad(const Tensor& x,
   }
 }
 
+template <typename T>
+void max_grad(const Tensor& x,
+              const Tensor& out,
+              const Tensor& out_grad,
+              const IntArray& axis,
+              bool keepdim,
+              bool reduce_all,
+              Tensor* x_grad) {
+  if (!x_grad) {
+    return;
+  }
+
+  if (axis.size() == 0) {
+    Tensor x_grad_tmp;
+    amax_grad<T>(x, out, out_grad, axis, keepdim, reduce_all, &x_grad_tmp);
+    set_output<T>(x_grad_tmp, x_grad);
+    return;
+  }
+
+  Tensor x_grad_tmp;
+  if (has_dynamic_shape(x.shape())) {
+    const Tensor x_shape = shape64<T>(x);
+    const Tensor zero_tensor =
+        backend::full_with_tensor<T>(x_shape, 0.0, x.dtype(), x.place());
+    const int64_t axis_size = axis.size();
+    const int64_t x_dim_size = x.dims().size();
+
+    reduce_all = false;
+    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
+      reduce_all = true;
+    } else {
+      reduce_all = false;
+    }
+
+    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
+      auto out_grad_tmp = backend::expand<T>(out_grad, x_shape);
+      auto out_tmp = backend::expand<T>(out, x_shape);
+      auto mask = equal<T>(x, out_tmp);
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    } else {
+      const Tensor out_grad_shape = shape64<T>(out_grad);
+      auto axis_ = std::vector<int64_t>();
+
+      if (reduce_all) {
+        for (int64_t i = 0; i < x_dim_size; i++) {
+          axis_.push_back(i);
+        }
+      } else {
+        axis_ = axis.GetData();
+        for (int64_t i = 0; i < axis_size; i++) {
+          if (axis[i] < 0) {
+            axis_[i] = axis[i] + x_dim_size;
+          }
+        }
+      }
+      const Tensor out_grad_shape_extend =
+          get_unsqueeze_dims<T>(out_grad_shape, axis_);
+      auto out_grad_ = backend::reshape<T>(out_grad, out_grad_shape_extend);
+      auto out_ = backend::reshape<T>(out, out_grad_shape_extend);
+      auto out_grad_tmp = backend::expand<T>(out_grad_, x_shape);
+      auto out_tmp = backend::expand<T>(out_, x_shape);
+      auto mask = equal<T>(x, out_tmp);
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
+  } else {
+    auto zero_tensor =
+        full<T>(common::vectorize(x.dims()), 0.0, x.dtype(), x.place());
+    std::vector<int64_t> x_dim = common::vectorize<int64_t>(x.dims());
+    int64_t axis_size = axis.size();
+    int64_t x_dim_size = x_dim.size();
+    reduce_all = false;
+    if (reduce_all || axis_size == 0 || axis_size == x_dim_size) {
+      reduce_all = true;
+    } else {
+      reduce_all = false;
+    }
+
+    if (x_dim_size == 0 || x_dim_size == 1 || keepdim) {
+      auto out_grad_tmp = out_grad.expand(IntArray(x_dim));
+      auto out_tmp = out.expand(IntArray(x_dim));
+      auto mask = equal<T>(x, out_tmp);
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    } else {
+      auto axis_ = std::vector<int64_t>();
+      if (reduce_all) {
+        for (int64_t i = 0; i < x_dim_size; i++) {
+          axis_.push_back(i);
+        }
+      } else {
+        axis_ = axis.GetData();
+        for (int64_t i = 0; i < axis_size; i++) {
+          if (axis[i] < 0) {
+            axis_[i] = axis[i] + x_dim_size;
+          }
+        }
+      }
+      auto out_grad_shape = get_unsqueeze_dims(out_grad, axis_);
+      auto out_grad_ = reshape<T>(out_grad, out_grad_shape);
+      auto out_ = reshape<T>(out, out_grad_shape);
+      auto out_grad_tmp = out_grad_.expand(IntArray(x_dim));
+      auto out_tmp = out_.expand(IntArray(x_dim));
+      auto mask = equal<T>(x, out_tmp);
+      x_grad_tmp = where<T>(mask, out_grad_tmp, zero_tensor);
+    }
+  }
+  set_output<T>(x_grad_tmp, x_grad);
+}
+
 template <typename T>
 void p_norm_grad(const Tensor& x,
     /*output of forward was reserved for efficient backward*/
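Two things change in this file: max_grad now short-circuits to amax_grad when axis is empty (consistent with the axis_size == 0 tie-splitting change in composite_backward_api.h above), and the whole definition moves below amin_grad/amax_grad, presumably so the amax_grad name is visible at the point of the new call. A minimal sketch of that definition-order constraint, with hypothetical names:

#include <iostream>

// Must be declared before the call below; calling it with explicit template
// arguments from a point where the name is unknown would not compile, which is
// the same constraint behind moving max_grad below amax_grad in details.h.
template <typename T>
void amax_grad_sketch(T out_grad) {
  std::cout << "delegate to amax-style grad: " << out_grad << "\n";
}

template <typename T>
void max_grad_sketch(T out_grad, bool full_reduce) {
  if (full_reduce) {  // mirrors the axis.size() == 0 fast path
    amax_grad_sketch<T>(out_grad);
    return;
  }
  std::cout << "regular max grad path: " << out_grad << "\n";
}

int main() {
  max_grad_sketch<double>(1.0, /*full_reduce=*/true);
  return 0;
}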

paddle/phi/core/platform/device/gpu/gpu_launch_config.h

Lines changed: 0 additions & 8 deletions
@@ -176,14 +176,6 @@ inline GpuLaunchConfig GetGpuLaunchConfig2D(const phi::GPUContext& context,
   return config;
 }
 
-template <typename Context>
-void LimitGridDim(const Context& ctx, dim3* grid_dim) {
-  auto max_grid_dim =
-      reinterpret_cast<const phi::GPUContext&>(ctx).GetCUDAMaxGridDimSize();
-  grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0];
-  grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1];
-  grid_dim->z = grid_dim->z < max_grid_dim[2] ? grid_dim->z : max_grid_dim[2];
-}
 }  // namespace platform
 }  // namespace paddle
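The removed LimitGridDim helper clamped each launch-grid dimension to the device maximum reported by GetCUDAMaxGridDimSize(); this paddle::platform copy is dropped, presumably superseded by the phi-side gridDim handling referenced in the commit message (#72507). A minimal standalone sketch of the same clamping logic, using a stand-in Dim3 struct and typical CUDA limits rather than a real device query:

#include <algorithm>
#include <array>
#include <cstdint>
#include <iostream>

struct Dim3 { uint32_t x, y, z; };  // stand-in for CUDA's dim3

// Clamp each grid dimension to the device maximum, as the removed helper did.
void LimitGridDimSketch(const std::array<uint32_t, 3>& max_grid, Dim3* grid) {
  grid->x = std::min(grid->x, max_grid[0]);
  grid->y = std::min(grid->y, max_grid[1]);
  grid->z = std::min(grid->z, max_grid[2]);
}

int main() {
  // Typical CUDA limits: x up to 2^31 - 1, y and z up to 65535.
  std::array<uint32_t, 3> max_grid{2147483647u, 65535u, 65535u};
  Dim3 grid{4096u, 1000000u, 1u};  // y exceeds the limit for a big-tensor launch
  LimitGridDimSketch(max_grid, &grid);
  std::cout << grid.x << " " << grid.y << " " << grid.z << "\n";  // 4096 65535 1
  return 0;
}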

0 commit comments
