|
| 1 | +#include "pytorch_npu_helper.hpp" |
| 2 | + |
| 3 | +using namespace NPU_NAME_SPACE; |
| 4 | +using namespace std; |
| 5 | + |
| 6 | +void border_align_forward_impl(const Tensor &input, const Tensor &boxes, Tensor output, |
| 7 | + Tensor argmax_idx, const int pool_size); |
| 8 | + |
| 9 | +void border_align_forward_npu(const Tensor &input, const Tensor &boxes, Tensor output, |
| 10 | + Tensor argmax_idx, const int pool_size){ |
| 11 | + TORCH_CHECK(input.size(0) == boxes.size(0), "The batch sizes of feature map and rois must be the same."); |
| 12 | + TORCH_CHECK(input.size(1) % 4 == 0, "The number of channels must be divisible by 4."); |
| 13 | + TORCH_CHECK(pool_size >= 2, "The pool size should be larger than 2."); |
| 14 | + int32_t batch_size = input.size(0); |
| 15 | + int32_t channels = input.size(1); |
| 16 | + int32_t height = input.size(2); |
| 17 | + int32_t width = input.size(3); |
| 18 | + at::Tensor feature_map = input.permute({0, 2, 3, 1}).contiguous(); |
| 19 | + at::Tensor rois_map = boxes.contiguous(); |
| 20 | + at::Tensor temp_tensor = at::zeros({batch_size, height * width, pool_size + 1, channels}, input.options()); |
| 21 | + EXEC_NPU_CMD(aclnnBorderAlign, feature_map, rois_map, pool_size, temp_tensor); |
| 22 | + auto max_result = temp_tensor.max(-2); |
| 23 | + at::Tensor output_ = std::get<0>(max_result).to(at::kFloat); |
| 24 | + output_ = output_.reshape({batch_size, height * width, 4, channels / 4}).permute({0, 3, 1, 2}).contiguous(); |
| 25 | + output.copy_(output_); |
| 26 | + at::Tensor argmax_idx_ = std::get<1>(max_result).to(at::kInt); |
| 27 | + argmax_idx_ = argmax_idx_.reshape({batch_size, height * width, 4, channels / 4}).permute({0, 3, 1, 2}).contiguous(); |
| 28 | + argmax_idx.copy_(argmax_idx_); |
| 29 | +} |
| 30 | +REGISTER_NPU_IMPL(border_align_forward_impl, border_align_forward_npu); |
| 31 | + |
| 32 | + |
| 33 | +void border_align_backward_impl(const Tensor &grad_output, const Tensor &boxes, |
| 34 | + const Tensor &argmax_idx, Tensor grad_input, |
| 35 | + const int pool_size); |
| 36 | + |
| 37 | +void border_align_backward_npu(const Tensor &grad_output, const Tensor &boxes, |
| 38 | + const Tensor &argmax_idx, Tensor grad_input, |
| 39 | + const int pool_size){ |
| 40 | + TORCH_CHECK(grad_output.dim() == 4, "grad_out.dim() must be 4, but got: ", grad_output.dim()); |
| 41 | + TORCH_CHECK(boxes.dim() == 3, "idx.dim() must be 3, but got: ", boxes.dim()); |
| 42 | + TORCH_CHECK(argmax_idx.dim() == 4, "argmax_idx.dim() must be 4, but got: ", argmax_idx.dim()); |
| 43 | + |
| 44 | + int32_t batch_size = grad_output.size(0); |
| 45 | + int32_t feat_channels = grad_output.size(1) * 4; |
| 46 | + int32_t channels = grad_output.size(1); |
| 47 | + int32_t box_size = boxes.size(1); |
| 48 | + int32_t height = grad_input.size(2); |
| 49 | + int32_t width = grad_input.size(3); |
| 50 | + |
| 51 | + EXEC_NPU_CMD(aclnnBorderAlignGrad, grad_output, boxes, argmax_idx, channels, box_size, height, width, pool_size, batch_size, grad_input); |
| 52 | +} |
| 53 | +REGISTER_NPU_IMPL(border_align_backward_impl, border_align_backward_npu); |
0 commit comments