1
+ #include " pytorch_npu_helper.hpp"
2
+
3
+ using namespace NPU_NAME_SPACE ;
4
+ using namespace std ;
5
+
6
+ void border_align_forward_impl (const Tensor &input, const Tensor &boxes, Tensor output,
7
+ Tensor argmax_idx, const int pool_size);
8
+
9
+ void border_align_forward_npu (const Tensor &input, const Tensor &boxes, Tensor output,
10
+ Tensor argmax_idx, const int pool_size){
11
+ TORCH_CHECK (input.size (0 ) == boxes.size (0 ), " The batch sizes of feature map and rois must be the same." );
12
+ TORCH_CHECK (input.size (1 ) % 4 == 0 , " The number of channels must be divisible by 4." );
13
+ TORCH_CHECK (pool_size >= 2 , " The pool size should be larger than 2." );
14
+ int32_t batch_size = input.size (0 );
15
+ int32_t channels = input.size (1 );
16
+ int32_t height = input.size (2 );
17
+ int32_t width = input.size (3 );
18
+ at::Tensor feature_map = input.permute ({0 , 2 , 3 , 1 }).contiguous ();
19
+ at::Tensor rois_map = boxes.contiguous ();
20
+ at::Tensor temp_tensor = at::zeros ({batch_size, height * width, pool_size + 1 , channels}, input.options ());
21
+ EXEC_NPU_CMD (aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor);
22
+ auto max_result = temp_tensor.max (-2 );
23
+ at::Tensor output_ = std::get<0 >(max_result).to (at::kFloat );
24
+ output_ = output_.reshape ({batch_size, height * width, 4 , channels / 4 }).permute ({0 , 3 , 1 , 2 }).contiguous ();
25
+ output.copy_ (output_);
26
+ at::Tensor argmax_idx_ = std::get<1 >(max_result).to (at::kInt );
27
+ argmax_idx_ = argmax_idx_.reshape ({batch_size, height * width, 4 , channels / 4 }).permute ({0 , 3 , 1 , 2 }).contiguous ();
28
+ argmax_idx.copy_ (argmax_idx_);
29
+ }
30
+ REGISTER_NPU_IMPL (border_align_forward_impl, border_align_forward_npu);
31
+
32
+
33
+ void border_align_backward_impl (const Tensor &grad_output, const Tensor &boxes,
34
+ const Tensor &argmax_idx, Tensor grad_input,
35
+ const int pool_size);
36
+
37
+ void border_align_backward_npu (const Tensor &grad_output, const Tensor &boxes,
38
+ const Tensor &argmax_idx, Tensor grad_input,
39
+ const int pool_size){
40
+ TORCH_CHECK (grad_output.dim () == 4 , " grad_out.dim() must be 4, but got: " , grad_output.dim ());
41
+ TORCH_CHECK (boxes.dim () == 3 , " idx.dim() must be 3, but got: " , boxes.dim ());
42
+ TORCH_CHECK (argmax_idx.dim () == 4 , " argmax_idx.dim() must be 4, but got: " , argmax_idx.dim ());
43
+
44
+ int32_t batch_size = grad_output.size (0 );
45
+ int32_t feat_channels = grad_output.size (1 ) * 4 ;
46
+ int32_t channels = grad_output.size (1 );
47
+ int32_t box_size = boxes.size (1 );
48
+ int32_t height = grad_input.size (2 );
49
+ int32_t width = grad_input.size (3 );
50
+
51
+ EXEC_NPU_CMD (aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input);
52
+ }
53
+ REGISTER_NPU_IMPL (border_align_backward_impl, border_align_backward_npu);
0 commit comments