
Commit c6478ef: fix npu bug.
1 parent 2100900

17 files changed: +273 -53 lines
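
Most of the NPU fixes below share one pattern: half-precision (fp16) tensors are cast up to fp32 before the operator is dispatched, and the result is cast back and copied into the caller's output tensor, presumably because the underlying kernels do not handle fp16 directly. A minimal PyTorch sketch of that round-trip, for orientation only (run_fp32_op is a hypothetical stand-in, not the actual OpCommand path):

import torch

def run_fp32_op(x: torch.Tensor) -> torch.Tensor:
    # hypothetical stand-in for an NPU kernel that only supports fp32
    return x * 2.0

def fp16_safe_op(inp: torch.Tensor, out: torch.Tensor) -> None:
    # cast fp16 input up to fp32, run the op, cast back, copy into the output
    is_half = inp.dtype == torch.half
    x = inp.float() if is_half else inp
    y = run_fp32_op(x)
    if is_half:
        y = y.half()
    out.copy_(y)

inp = torch.randn(4, 8, dtype=torch.half)
out = torch.empty_like(inp)
fp16_safe_op(inp, out)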

mmcv/ops/csrc/pytorch/npu/chamfer_distance_npu.cpp (+17 -2)

@@ -5,19 +5,34 @@ using namespace std;

 void chamfer_distance_forward_npu(Tensor XYZ1, Tensor XYZ2, Tensor dist1,
                                   Tensor dist2, Tensor idx1, Tensor idx2) {
+  bool is_half = XYZ1.scalar_type() == at::kHalf;
   at::Tensor xyz1 = at::ones_like(XYZ1);
   at::Tensor xyz2 = at::ones_like(XYZ2);
+  at::Tensor distf1 = at::ones_like(dist1);
+  at::Tensor distf2 = at::ones_like(dist2);
   xyz1 = XYZ1.transpose(1, 2).transpose(0, 1);
   xyz2 = XYZ2.transpose(1, 2).transpose(0, 1);
+  if (is_half) {
+    xyz1 = xyz1.to(at::kFloat);
+    xyz2 = xyz2.to(at::kFloat);
+    distf1 = dist1.to(at::kFloat);
+    distf2 = dist2.to(at::kFloat);
+  }
   OpCommand cmd;
   cmd.Name("ChamferDistance")
       .Input(xyz1)
       .Input(xyz2)
-      .Output(dist1)
-      .Output(dist2)
+      .Output(distf1)
+      .Output(distf2)
       .Output(idx1)
       .Output(idx2)
       .Run();
+  if (is_half) {
+    distf1 = distf1.to(at::kHalf);
+    distf2 = distf2.to(at::kHalf);
+  }
+  dist1.copy_(distf1);
+  dist2.copy_(distf2);
 }

 void chamfer_distance_backward_npu(Tensor xyz1, Tensor xyz2, Tensor idx1,
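
For a quick Python-side check of the fp16 path this touches, a hedged sketch; it assumes mmcv.ops.chamfer_distance accepts (B, N, 2) point sets in the input dtype and that a torch_npu build exposes the "npu" device (both assumptions, not shown by this diff):

# Sketch only: chamfer_distance with half-precision inputs on NPU.
# Assumes (B, N, 2) point sets and a torch_npu "npu" device.
import torch
from mmcv.ops import chamfer_distance

xyz1 = torch.rand(2, 100, 2, dtype=torch.half, device="npu")
xyz2 = torch.rand(2, 100, 2, dtype=torch.half, device="npu")
dist1, dist2, idx1, idx2 = chamfer_distance(xyz1, xyz2)
assert dist1.dtype == torch.half  # distances are cast back to the input dtype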

mmcv/ops/csrc/pytorch/npu/focal_loss_npu.cpp (+85 -20)

@@ -4,6 +4,21 @@ using namespace std;

 void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
+  at::Tensor input_y = input;
+  at::Tensor output_y = output;
+  bool is_half = input.scalar_type() == at::kHalf;
+  if (is_half) {
+    input_y = input.to(at::kFloat);
+    output_y = output.to(at::kFloat);
+  }
+  int64_t weight_size = weight.size(0);
+  at::Tensor weight_y = at::ones_like(input_y);
+  if (weight_size > 0) {
+    weight_y = at::broadcast_to(weight, input.sizes());
+    if (is_half) {
+      weight_y = weight_y.to(at::kFloat);
+    }
+  }
   int64_t n_class = input.size(1);
   at::Tensor target_y = at::ones_like(input);
   if (n_class == 1) {
@@ -12,24 +27,26 @@ void sigmoid_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
     target_y = at::add(target_y, 1.0);
   } else {
     target_y = at::one_hot(target, n_class);
+    weight_y = at::mul(weight_y, target_y);
+    weight_y = at::sum(weight_y, 1, true);
+    weight_y = at::broadcast_to(weight_y, input.sizes());
   }
   target_y = target_y.to(at::kInt);
-  int64_t weight_size = weight.size(0);
-  at::Tensor weight_y = at::ones_like(input);
-  if (weight_size > 0) {
-    weight_y = at::broadcast_to(weight, input.sizes());
-  }
   OpCommand cmd;
   string reduction = "none";
   cmd.Name("SigmoidFocalLoss")
-      .Input(input)
+      .Input(input_y)
       .Input(target_y)
       .Input(weight_y)
-      .Output(output)
+      .Output(output_y)
       .Attr("gamma", gamma)
       .Attr("alpha", alpha)
       .Attr("reduction", reduction)
       .Run();
+  if (is_half) {
+    output_y = output_y.to(at::kHalf);
+  }
+  output.copy_(output_y);
 }

 void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
@@ -38,34 +55,51 @@ void sigmoid_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
 void sigmoid_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
                                      Tensor grad_input, float gamma,
                                      float alpha) {
+  at::Tensor input_y = input;
+  at::Tensor grad_input_y = grad_input;
+  bool is_half = input.scalar_type() == at::kHalf;
+  if (is_half) {
+    input_y = input.to(at::kFloat);
+    grad_input_y = grad_input.to(at::kFloat);
+  }
+  int64_t weight_size = weight.size(0);
+  at::Tensor weight_y = at::ones_like(input_y);
+  if (weight_size > 0) {
+    weight_y = at::broadcast_to(weight, input.sizes());
+    if (is_half) {
+      weight_y = weight_y.to(at::kFloat);
+    }
+  }
   int64_t n_class = input.size(1);
   at::Tensor target_y = at::ones_like(input);
   if (n_class == 1) {
     target_y = at::reshape(target, input.sizes());
   } else {
     target_y = at::one_hot(target, n_class);
+    weight_y = at::mul(weight_y, target_y);
+    weight_y = at::sum(weight_y, 1, true);
+    weight_y = at::broadcast_to(weight_y, input.sizes());
     target_y = at::mul(target_y, -1.0);
     target_y = at::add(target_y, 1.0);
   }
   target_y = target_y.to(at::kInt);
   at::Tensor grad_up = at::ones_like(input);
-  int64_t weight_size = weight.size(0);
-  at::Tensor weight_y = at::ones_like(input);
-  if (weight_size > 0) {
-    weight_y = at::broadcast_to(weight, input.sizes());
-  }
   OpCommand cmd;
   string reduction = "none";
   cmd.Name("SigmoidFocalLossGrad")
-      .Input(input)
+      .Input(input_y)
       .Input(target_y)
       .Input(grad_up)
       .Input(weight_y)
-      .Output(grad_input)
+      .Output(grad_input_y)
       .Attr("gamma", gamma)
       .Attr("alpha", alpha)
       .Attr("reduction", reduction)
       .Run();
+  if (is_half) {
+    grad_input_y = grad_input_y.to(at::kHalf);
+  }
+  grad_input.copy_(grad_input_y);
 }

 void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,
@@ -74,26 +108,40 @@ void sigmoid_focal_loss_backward_impl(Tensor input, Tensor target,

 void softmax_focal_loss_forward_npu(Tensor input, Tensor target, Tensor weight,
                                     Tensor output, float gamma, float alpha) {
+  at::Tensor input_y = input;
+  bool is_half = input.scalar_type() == at::kHalf;
+  if (is_half) {
+    input_y = input.to(at::kFloat);
+  }
   int64_t n_class = input.size(1);
   at::Tensor target_y = at::one_hot(target, n_class);
   target_y = target_y.to(at::kInt);
   int64_t weight_size = weight.size(0);
-  at::Tensor weight_y = at::ones_like(input);
+  at::Tensor weight_y = at::ones_like(input_y);
   if (weight_size > 0) {
     weight_y = at::broadcast_to(weight, input.sizes());
+    if (is_half) {
+      weight_y = weight_y.to(at::kFloat);
+    }
+    weight_y = at::mul(weight_y, target_y);
+    weight_y = at::sum(weight_y, 1, true);
+    weight_y = at::broadcast_to(weight_y, input.sizes());
   }
-  at::Tensor op_output = at::ones_like(input);
+  at::Tensor op_output = at::ones_like(input_y);
   OpCommand cmd;
   string reduction = "none";
   cmd.Name("SoftmaxFocalLoss")
-      .Input(input)
+      .Input(input_y)
       .Input(target_y)
       .Input(weight_y)
       .Output(op_output)
       .Attr("gamma", gamma)
       .Attr("alpha", alpha)
       .Attr("reduction", reduction)
       .Run();
+  if (is_half) {
+    op_output = op_output.to(at::kHalf);
+  }
   int64_t n_batch = input.size(0);
   c10::SmallVector<int64_t, 2> offsets = {0, 0};
   c10::SmallVector<int64_t, 2> sizes = {n_batch, 1};
@@ -124,27 +172,44 @@ void softmax_focal_loss_forward_impl(Tensor input, Tensor target, Tensor weight,
 void softmax_focal_loss_backward_npu(Tensor input, Tensor target, Tensor weight,
                                      Tensor buff, Tensor grad_input,
                                      float gamma, float alpha) {
+  at::Tensor input_y = input;
+  at::Tensor grad_input_y = grad_input;
+  bool is_half = input.scalar_type() == at::kHalf;
+  if (is_half) {
+    input_y = input.to(at::kFloat);
+    grad_input_y = grad_input.to(at::kFloat);
+  }
   int64_t n_class = input.size(1);
   at::Tensor target_y = at::one_hot(target, n_class);
   target_y = target_y.to(at::kInt);
   at::Tensor grad_up = at::ones_like(input);
   int64_t weight_size = weight.size(0);
-  at::Tensor weight_y = at::ones_like(input);
+  at::Tensor weight_y = at::ones_like(input_y);
   if (weight_size > 0) {
     weight_y = at::broadcast_to(weight, input.sizes());
+    if (is_half) {
+      weight_y = weight_y.to(at::kFloat);
+    }
+    weight_y = at::mul(weight_y, target_y);
+    weight_y = at::sum(weight_y, 1, true);
+    weight_y = at::broadcast_to(weight_y, input.sizes());
   }
   OpCommand cmd;
   string reduction = "none";
   cmd.Name("SoftmaxFocalLossGrad")
-      .Input(input)
+      .Input(input_y)
       .Input(target_y)
       .Input(grad_up)
       .Input(weight_y)
-      .Output(grad_input)
+      .Output(grad_input_y)
       .Attr("gamma", gamma)
       .Attr("alpha", alpha)
       .Attr("reduction", reduction)
       .Run();
+  if (is_half) {
+    grad_input_y = grad_input_y.to(at::kHalf);
+  }
+  grad_input.copy_(grad_input_y);
 }

 void softmax_focal_loss_backward_impl(Tensor input, Tensor target,
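
A hedged Python-side sketch of the sigmoid path with fp16 inputs; the sigmoid_focal_loss call follows the mmcv.ops Python API, while the "npu" device (via torch_npu) is an assumption:

# Sketch only: fp16 sigmoid focal loss forward and backward, the paths patched above.
import torch
from mmcv.ops import sigmoid_focal_loss

pred = torch.randn(8, 4, dtype=torch.half, device="npu", requires_grad=True)
target = torch.randint(0, 4, (8,), device="npu")
loss = sigmoid_focal_loss(pred, target, gamma=2.0, alpha=0.25, reduction="mean")
loss.backward()  # grad_input is computed in fp32 internally and copied back as fp16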

mmcv/ops/csrc/pytorch/npu/gather_points_npu.cpp (+13 -3)

@@ -24,6 +24,12 @@ void gather_points_forward_npu(int b, int c, int n, int npoints,
 void gather_points_backward_npu(int b, int c, int n, int npoints,
                                 const Tensor grad_out, const Tensor idx,
                                 Tensor grad_points) {
+  at::Tensor grad_out_cast = grad_out;
+  at::Tensor grad_points_cast = grad_points;
+  if (grad_out.scalar_type() == at::ScalarType::Half) {
+    grad_out_cast = grad_out.to(at::kFloat);
+    grad_points_cast = grad_points.to(at::kFloat);
+  }
   at::Tensor indices = idx;
   if (idx.scalar_type() != at::ScalarType::Int) {
     indices = idx.to(at::kInt);
@@ -37,11 +43,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
   for (uint64_t i = 0; i < shape.size(); i++) {
     pad_size.emplace_back(shape[i]);
   }
-  at::Tensor trans_grad_points = grad_points.transpose(1, 2).contiguous();
+  at::Tensor trans_grad_points = grad_points_cast.transpose(1, 2).contiguous();
   at::Tensor grad_points_view = trans_grad_points.view(
       {trans_grad_points.sizes()[0] * trans_grad_points.sizes()[1],
        trans_grad_points.sizes()[2]});
-  at::Tensor trans_grad_out = grad_out.transpose(1, 2).contiguous();
+  at::Tensor trans_grad_out = grad_out_cast.transpose(1, 2).contiguous();
   trans_grad_out = trans_grad_out.view(
       {trans_grad_out.sizes()[0] * trans_grad_out.sizes()[1],
        trans_grad_out.sizes()[2]});
@@ -63,7 +69,11 @@ void gather_points_backward_npu(int b, int c, int n, int npoints,
   at::Tensor grad_points_result =
       grad_points_view.view(trans_grad_points.sizes());
   grad_points_result = grad_points_result.transpose(1, 2);
-  grad_points.copy_(grad_points_result);
+  at::Tensor grad_points_result_cast = grad_points_result;
+  if (grad_out.scalar_type() == at::ScalarType::Half) {
+    grad_points_result_cast = grad_points_result.to(at::kHalf);
+  }
+  grad_points.copy_(grad_points_result_cast);
 }

 void gather_points_forward_impl(int b, int c, int n, int npoints,
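
The backward change matters when autograd runs in fp16. A hedged usage sketch, assuming mmcv.ops.gather_points takes (B, C, N) features plus (B, npoint) int32 indices and that an "npu" device is available:

# Sketch only: fp16 gather_points backward, which now up-casts gradients to fp32.
import torch
from mmcv.ops import gather_points

features = torch.randn(2, 16, 64, dtype=torch.half, device="npu", requires_grad=True)
indices = torch.randint(0, 64, (2, 32), dtype=torch.int32, device="npu")
out = gather_points(features, indices)  # (2, 16, 32)
out.sum().backward()                    # gradient arrives back in fp16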

mmcv/ops/csrc/pytorch/npu/knn_npu.cpp (new file, +21)

@@ -0,0 +1,21 @@
+#include "pytorch_npu_helper.hpp"
+#include "torch_npu/csrc/aten/NPUNativeFunctions.h"
+#include "torch_npu/csrc/framework/utils/OpAdapter.h"
+
+using namespace NPU_NAME_SPACE;
+using namespace std;
+
+void knn_forward_npu(int b, int n, int m, int nsample, const Tensor xyz,
+                     const Tensor new_xyz, Tensor idx, Tensor dist2) {
+  // transpose known from [B, N, 3] to [B, 3, N]
+  at::Tensor source = xyz.transpose(2, 1).contiguous();
+  at::Tensor target = new_xyz.contiguous();
+
+  bool is_from_knn = true;
+  EXEC_NPU_CMD(aclnnKnn, source, target, is_from_knn, dist2);
+}
+
+void knn_forward_impl(int b, int n, int m, int nsample, const Tensor xyz,
+                      const Tensor new_xyz, Tensor idx, Tensor dist2);
+
+REGISTER_NPU_IMPL(knn_forward_impl, knn_forward_npu);
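
A hedged sketch of how this new kernel gets exercised from Python; mmcv.ops.knn(k, xyz, center_xyz) with (B, N, 3) known points and (B, npoint, 3) query points is assumed from the existing CUDA path, and the "npu" device comes from torch_npu:

# Sketch only: k-nearest-neighbour lookup on NPU.
import torch
from mmcv.ops import knn

xyz = torch.rand(2, 256, 3, device="npu")        # known points, (B, N, 3)
center_xyz = torch.rand(2, 32, 3, device="npu")  # query points, (B, npoint, 3)
idx = knn(5, xyz, center_xyz)                    # neighbour indices, (B, npoint, 5)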

mmcv/ops/csrc/pytorch/npu/roi_pool_npu.cpp (+10 -4)

@@ -50,23 +50,29 @@ void roi_pool_backward_npu(Tensor grad_output, Tensor rois, Tensor argmax,
   int64_t pooled_height_64 = pooled_height;
   int64_t pooled_width_64 = pooled_width;
   int64_t pooled_channel = 1;
+  at::Tensor argmax_trans = argmax.transpose(1, 2).transpose(2, 3);
+  at::Tensor grad_output_trans = grad_output.transpose(1, 2).transpose(2, 3);
   at::Tensor roi_actual_num =
       at::empty_like(rois, rois.options().dtype(at::kInt));
-  at::Tensor x = at::ones_like(grad_input);
+  at::Tensor x = at::ones_like(grad_input).transpose(1, 2).transpose(2, 3);
+  at::Tensor y = at::zeros_like(x);
   OpCommand cmd;
   cmd.Name("RoiPoolingGradWithArgMax")
-      .Input(grad_output)
+      .Input(grad_output_trans)
       .Input(x)
       .Input(rois)
       .Input(roi_actual_num)
-      .Input(argmax)
-      .Output(grad_input)
+      .Input(argmax_trans)
+      .Output(y)
       .Attr("pooled_h", pooled_height_64)
       .Attr("pooled_w", pooled_width_64)
       .Attr("spatial_scale_h", spatial_scale)
       .Attr("spatial_scale_w", spatial_scale)
       .Attr("pool_channel", pooled_channel)
       .Run();
+  at::Tensor result = y.transpose(2, 3).transpose(1, 2);
+  at::Tensor res = result.contiguous();
+  grad_input.copy_(res);
 }

 void roi_pool_forward_impl(Tensor input, Tensor rois, Tensor output,
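
The Python call is unchanged; the added transposes appear to feed the NPU operator channel-last (NHWC) views of the gradient tensors and convert the result back to NCHW. A hedged sketch of the backward path this fixes, with the "npu" device as an assumption:

# Sketch only: RoI pooling backward on NPU with an NCHW feature map.
import torch
from mmcv.ops import RoIPool

feat = torch.randn(1, 8, 32, 32, device="npu", requires_grad=True)
rois = torch.tensor([[0.0, 4.0, 4.0, 20.0, 20.0]], device="npu")  # (batch_idx, x1, y1, x2, y2)
pooled = RoIPool(output_size=7, spatial_scale=1.0)(feat, rois)
pooled.sum().backward()  # grad w.r.t. feat flows through RoiPoolingGradWithArgMax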

mmcv/ops/csrc/pytorch/npu/stack_ball_query_npu.cpp (+3 -2)

@@ -8,9 +8,10 @@ void stack_ball_query_forward_npu(float max_radius, int nsample,
                                   const Tensor new_xyz_batch_cnt,
                                   const Tensor xyz, const Tensor xyz_batch_cnt,
                                   Tensor idx) {
-  at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous();
+  at::Tensor xyz_transpose = xyz.transpose(0, 1).contiguous().to(at::kFloat);
+  at::Tensor new_xyz_fp32 = new_xyz.to(at::kFloat);
   double max_radius_double = double(max_radius);
-  EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz, xyz_batch_cnt,
+  EXEC_NPU_CMD(aclnnStackBallQuery, xyz_transpose, new_xyz_fp32, xyz_batch_cnt,
                new_xyz_batch_cnt, max_radius_double, nsample, idx);
 }
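
A hedged sketch of the stacked ball query from Python with fp16 coordinates, which this change now casts to fp32 before dispatch. The stacked-mode signature of mmcv.ops.ball_query (with per-sample point counts) is assumed from the 2.x API and may differ between versions; the "npu" device is likewise an assumption:

# Sketch only: stacked ball query with half-precision point clouds.
# Assumed signature: ball_query(min_radius, max_radius, sample_num,
#                               xyz, center_xyz, xyz_batch_cnt, center_xyz_batch_cnt)
import torch
from mmcv.ops import ball_query

xyz = torch.rand(128, 3, dtype=torch.half, device="npu")        # stacked points
center_xyz = torch.rand(16, 3, dtype=torch.half, device="npu")  # stacked query centers
xyz_batch_cnt = torch.tensor([64, 64], dtype=torch.int32, device="npu")
center_xyz_batch_cnt = torch.tensor([8, 8], dtype=torch.int32, device="npu")
idx = ball_query(0.0, 0.8, 16, xyz, center_xyz, xyz_batch_cnt, center_xyz_batch_cnt)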
