Replace the document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions

View File

@@ -0,0 +1,91 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
  for (int i = 0; i < rbox_num; i++) {
    output_data_ptr[i] =
        rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5);
  }
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor>
MatchedRboxIouCPUForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0],
           "rbox1 and rbox2 must have the same number of boxes.");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace());
PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] {
matched_rbox_iou_cpu_kernel<data_t>(
rbox_num, rbox1.data<data_t>(),
rbox2.data<data_t>(), output.data<data_t>());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2) \
  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.is_cpu()) {
return MatchedRboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.is_gpu()) {
return MatchedRboxIouCUDAForward(rbox1, rbox2);
#endif
  }
  PD_THROW("matched_rbox_iou only supports CPU and GPU tensors.");
}
std::vector<std::vector<int64_t>>
MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape,
std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0]}};
}
std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(matched_rbox_iou)
.Inputs({"RBOX1", "RBOX2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(MatchedRboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype));
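
For intuition: matched_rbox_iou pairs row i of RBOX1 with row i of RBOX2 (both [N, 5] tensors laid out as (x_ctr, y_ctr, w, h, angle)) and returns a length-N vector of IoUs. Below is a minimal host-side sketch of that contract, reusing rbox_iou_single from the rbox_iou_utils.h header shown at the end of this commit; the bare include path and the test values are assumptions, not part of the commit.

#include <cstdio>
#include <vector>
#include "rbox_iou_utils.h" // assumed to be on the include path

int main() {
  // Two matched pairs, row-major (x_ctr, y_ctr, w, h, angle in radians).
  std::vector<float> a = {0, 0, 1, 1, 0, 5, 5, 1, 1, 0};
  std::vector<float> b = {0, 0, 1, 1, 0, 9, 9, 1, 1, 0};
  for (int i = 0; i < 2; ++i) {
    float iou = rbox_iou_single<float>(a.data() + i * 5, b.data() + i * 5);
    std::printf("pair %d: IoU = %.4f\n", i, iou); // 1.0000, then 0.0000
  }
  return 0;
}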

View File

@@ -0,0 +1,58 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
__global__ void
matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr,
const T *rbox2_data_ptr, T *output_data_ptr) {
for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num;
tid += blockDim.x * gridDim.x) {
output_data_ptr[tid] =
rbox_iou_single<T>(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor>
MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
  PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0],
           "rbox1 and rbox2 must have the same number of boxes.");
auto rbox_num = rbox1.shape()[0];
auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace());
const int thread_per_block = 512;
const int block_per_grid = CeilDiv(rbox_num, thread_per_block);
PD_DISPATCH_FLOATING_TYPES(
rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] {
matched_rbox_iou_cuda_kernel<
data_t><<<block_per_grid, thread_per_block, 0, rbox1.stream()>>>(
rbox_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
output.data<data_t>());
}));
return {output};
}
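
The kernel above uses the standard CUDA grid-stride-loop idiom: correctness does not depend on launching exactly one thread per box, because each thread strides by the total thread count. With block_per_grid = CeilDiv(rbox_num, thread_per_block) the loop body runs at most once per thread, but the stride form stays safe if the grid is ever clamped. A minimal standalone sketch of the same pattern (a hypothetical saxpy kernel, not part of this commit):

__global__ void saxpy_grid_stride(int n, float a, const float *x, float *y) {
  // Each thread starts at its global index and advances by the total number
  // of launched threads, so any n is covered by any grid size.
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    y[i] = a * x[i] + y[i];
  }
}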

View File

@@ -0,0 +1,121 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
template <typename T>
void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold,
const int64_t num_boxes, int64_t *num_keep_boxes,
int64_t *output_data) {
int num_masks = CeilDiv(num_boxes, 64);
std::vector<int64_t> masks(num_masks, 0);
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64))
continue;
T box_1[5];
for (int k = 0; k < 5; ++k) {
box_1[k] = boxes_data[i * 5 + k];
}
for (int64_t j = i + 1; j < num_boxes; ++j) {
if (masks[j / 64] & 1ULL << (j % 64))
continue;
T box_2[5];
for (int k = 0; k < 5; ++k) {
box_2[k] = boxes_data[j * 5 + k];
}
if (rbox_iou_single<T>(box_1, box_2) > threshold) {
masks[j / 64] |= 1ULL << (j % 64);
}
}
}
int64_t output_data_idx = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
if (masks[i / 64] & 1ULL << (i % 64))
continue;
output_data[output_data_idx++] = i;
}
*num_keep_boxes = output_data_idx;
for (; output_data_idx < num_boxes; ++output_data_idx) {
output_data[output_data_idx] = 0;
}
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> NMSRotatedCPUForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
CHECK_INPUT_CPU(boxes);
CHECK_INPUT_CPU(scores);
auto num_boxes = boxes.shape()[0];
auto order_t =
std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
auto keep =
paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
int64_t num_keep_boxes = 0;
PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] {
nms_rotated_cpu_kernel<data_t>(
boxes_sorted.data<data_t>(), threshold,
num_boxes, &num_keep_boxes,
keep.data<int64_t>());
}));
keep = keep.slice(0, num_keep_boxes);
return {paddle::gather(order_t, keep, /* axis=*/0)};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold);
#endif
std::vector<paddle::Tensor> NMSRotatedForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
if (boxes.is_cpu()) {
return NMSRotatedCPUForward(boxes, scores, threshold);
#ifdef PADDLE_WITH_CUDA
} else if (boxes.is_gpu()) {
return NMSRotatedCUDAForward(boxes, scores, threshold);
#endif
  }
  PD_THROW("nms_rotated only supports CPU and GPU tensors.");
}
std::vector<std::vector<int64_t>>
NMSRotatedInferShape(std::vector<int64_t> boxes_shape,
std::vector<int64_t> scores_shape) {
return {{-1}};
}
std::vector<paddle::DataType> NMSRotatedInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {paddle::DataType::INT64};
}
PD_BUILD_OP(nms_rotated)
.Inputs({"Boxes", "Scores"})
.Outputs({"Output"})
.Attrs({"threshold: float"})
.SetKernelFn(PD_KERNEL(NMSRotatedForward))
.SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype));
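
The CPU kernel above is the classic greedy NMS sweep, with the suppression flags packed 64 per int64_t word. A minimal sketch of the same algorithm over plain arrays, assuming the boxes are already sorted by descending score (which the op guarantees via paddle::argsort + paddle::gather); greedy_nms and the iou callback are hypothetical names standing in for rbox_iou_single:

#include <cstdint>
#include <functional>
#include <vector>

// Returns the kept indices (into the score-sorted order), best score first.
std::vector<int64_t>
greedy_nms(int64_t num_boxes, float threshold,
           const std::function<float(int64_t, int64_t)> &iou) {
  std::vector<bool> suppressed(num_boxes, false);
  std::vector<int64_t> keep;
  for (int64_t i = 0; i < num_boxes; ++i) {
    if (suppressed[i])
      continue; // already beaten by a higher-scoring box
    keep.push_back(i);
    for (int64_t j = i + 1; j < num_boxes; ++j) {
      if (!suppressed[j] && iou(i, j) > threshold)
        suppressed[j] = true;
    }
  }
  return keep;
}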

View File

@@ -0,0 +1,96 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "../rbox_iou/rbox_iou_utils.h"
#include "paddle/extension.h"
static const int64_t threadsPerBlock = sizeof(int64_t) * 8;
template <typename T>
__global__ void
nms_rotated_cuda_kernel(const T *boxes_data, const float threshold,
const int64_t num_boxes, int64_t *masks) {
  auto row_start = blockIdx.y;
  auto col_start = blockIdx.x;
  if (row_start > col_start)
    return;
  const int row_last_storage =
      min(num_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_last_storage =
      min(num_boxes - col_start * threadsPerBlock, threadsPerBlock);
  if (threadIdx.x < row_last_storage) {
    int64_t mask = 0;
    auto current_box_idx = row_start * threadsPerBlock + threadIdx.x;
    const T *current_box = boxes_data + current_box_idx * 5;
for (int i = 0; i < col_last_storage; ++i) {
const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5;
if (rbox_iou_single<T>(current_box, target_box) > threshold) {
mask |= 1ULL << i;
}
}
const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
masks[current_box_idx * blocks_per_line + col_start] = mask;
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> NMSRotatedCUDAForward(const paddle::Tensor &boxes,
const paddle::Tensor &scores,
float threshold) {
CHECK_INPUT_GPU(boxes);
CHECK_INPUT_GPU(scores);
auto num_boxes = boxes.shape()[0];
auto order_t =
std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true));
auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0);
const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock);
dim3 block(threadsPerBlock);
dim3 grid(blocks_per_line, blocks_per_line);
auto mask_dev = paddle::empty({num_boxes * blocks_per_line},
paddle::DataType::INT64, paddle::GPUPlace());
PD_DISPATCH_FLOATING_TYPES(
boxes.type(), "nms_rotated_cuda_kernel", ([&] {
nms_rotated_cuda_kernel<data_t><<<grid, block, 0, boxes.stream()>>>(
boxes_sorted.data<data_t>(), threshold, num_boxes,
mask_dev.data<int64_t>());
}));
auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true);
auto keep_host =
paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace());
int64_t *keep_host_ptr = keep_host.data<int64_t>();
int64_t *mask_host_ptr = mask_host.data<int64_t>();
std::vector<int64_t> remv(blocks_per_line);
int64_t last_box_num = 0;
for (int64_t i = 0; i < num_boxes; ++i) {
auto remv_element_id = i / threadsPerBlock;
auto remv_bit_id = i % threadsPerBlock;
if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) {
keep_host_ptr[last_box_num++] = i;
int64_t *current_mask = mask_host_ptr + i * blocks_per_line;
for (auto j = remv_element_id; j < blocks_per_line; ++j) {
remv[j] |= current_mask[j];
}
}
}
keep_host = keep_host.slice(0, last_box_num);
auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true);
return {paddle::gather(order_t, keep_dev, /* axis=*/0)};
}
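
Both NMS paths pack suppression state 64 boxes per int64_t word: bit (i % 64) of word (i / 64) refers to box i. The CUDA kernel fills a num_boxes x ceil(num_boxes / 64) bit matrix in parallel (row i holds the boxes that box i suppresses), and the serial host sweep above ORs those rows into remv in score order. A hypothetical standalone demo of just the packing arithmetic:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t num_boxes = 130; // spans ceil(130 / 64) = 3 words
  std::vector<uint64_t> remv((num_boxes + 63) / 64, 0);
  auto suppress = [&](int64_t i) { remv[i / 64] |= 1ULL << (i % 64); };
  auto suppressed = [&](int64_t i) {
    return ((remv[i / 64] >> (i % 64)) & 1ULL) != 0;
  };
  suppress(70); // sets bit 6 of word 1
  std::printf("%d %d\n", (int)suppressed(70), (int)suppressed(71)); // 1 0
  return 0;
}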

View File

@@ -0,0 +1,95 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "paddle/extension.h"
#include "rbox_iou_utils.h"
template <typename T>
void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num,
const T *rbox1_data_ptr, const T *rbox2_data_ptr,
T *output_data_ptr) {
  for (int i = 0; i < rbox1_num; i++) {
    for (int j = 0; j < rbox2_num; j++) {
      const int offset = i * rbox2_num + j;
      output_data_ptr[offset] =
          rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5);
    }
  }
}
#define CHECK_INPUT_CPU(x) \
PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
std::vector<paddle::Tensor> RboxIouCPUForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_CPU(rbox1);
CHECK_INPUT_CPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output =
paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace());
PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] {
rbox_iou_cpu_kernel<data_t>(
rbox1_num, rbox2_num, rbox1.data<data_t>(),
rbox2.data<data_t>(), output.data<data_t>());
}));
return {output};
}
#ifdef PADDLE_WITH_CUDA
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2);
#endif
#define CHECK_INPUT_SAME(x1, x2) \
  PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.")
std::vector<paddle::Tensor> RboxIouForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_SAME(rbox1, rbox2);
if (rbox1.is_cpu()) {
return RboxIouCPUForward(rbox1, rbox2);
#ifdef PADDLE_WITH_CUDA
} else if (rbox1.is_gpu()) {
return RboxIouCUDAForward(rbox1, rbox2);
#endif
  }
  PD_THROW("rbox_iou only supports CPU and GPU tensors.");
}
std::vector<std::vector<int64_t>>
RboxIouInferShape(std::vector<int64_t> rbox1_shape,
std::vector<int64_t> rbox2_shape) {
return {{rbox1_shape[0], rbox2_shape[0]}};
}
std::vector<paddle::DataType> RboxIouInferDtype(paddle::DataType t1,
paddle::DataType t2) {
return {t1};
}
PD_BUILD_OP(rbox_iou)
.Inputs({"RBox1", "RBox2"})
.Outputs({"Output"})
.SetKernelFn(PD_KERNEL(RboxIouForward))
.SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype));

View File

@@ -0,0 +1,109 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#include "paddle/extension.h"
#include "rbox_iou_utils.h"
// 2D block with 32 * 16 = 512 threads per block
const int BLOCK_DIM_X = 32;
const int BLOCK_DIM_Y = 16;
template <typename T>
__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num,
const T *rbox1_data_ptr,
const T *rbox2_data_ptr,
T *output_data_ptr) {
// get row_start and col_start
const int rbox1_block_idx = blockIdx.x * blockDim.x;
const int rbox2_block_idx = blockIdx.y * blockDim.y;
const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x);
const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y);
__shared__ T block_boxes1[BLOCK_DIM_X * 5];
__shared__ T block_boxes2[BLOCK_DIM_Y * 5];
// It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y
if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) {
block_boxes1[threadIdx.x * 5 + 0] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0];
block_boxes1[threadIdx.x * 5 + 1] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1];
block_boxes1[threadIdx.x * 5 + 2] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2];
block_boxes1[threadIdx.x * 5 + 3] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3];
block_boxes1[threadIdx.x * 5 + 4] =
rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4];
}
  // rbox2_thread_num <= BLOCK_DIM_Y <= BLOCK_DIM_X, so threadIdx.x can also
  // index the rbox2 tile; reuse the same threadIdx.y == 0 guard as above.
if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) {
block_boxes2[threadIdx.x * 5 + 0] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0];
block_boxes2[threadIdx.x * 5 + 1] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1];
block_boxes2[threadIdx.x * 5 + 2] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2];
block_boxes2[threadIdx.x * 5 + 3] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3];
block_boxes2[threadIdx.x * 5 + 4] =
rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4];
}
  // Wait until both shared-memory tiles are fully populated.
__syncthreads();
if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) {
int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx +
threadIdx.y;
output_data_ptr[offset] = rbox_iou_single<T>(
block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5);
}
}
#define CHECK_INPUT_GPU(x) \
PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
std::vector<paddle::Tensor> RboxIouCUDAForward(const paddle::Tensor &rbox1,
const paddle::Tensor &rbox2) {
CHECK_INPUT_GPU(rbox1);
CHECK_INPUT_GPU(rbox2);
auto rbox1_num = rbox1.shape()[0];
auto rbox2_num = rbox2.shape()[0];
auto output =
paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace());
const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X);
const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y);
dim3 blocks(blocks_x, blocks_y);
dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y);
PD_DISPATCH_FLOATING_TYPES(
rbox1.type(), "rbox_iou_cuda_kernel", ([&] {
rbox_iou_cuda_kernel<data_t><<<blocks, threads, 0, rbox1.stream()>>>(
rbox1_num, rbox2_num, rbox1.data<data_t>(), rbox2.data<data_t>(),
output.data<data_t>());
}));
return {output};
}
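
Each block stages up to BLOCK_DIM_X = 32 rbox1 boxes and BLOCK_DIM_Y = 16 rbox2 boxes in shared memory, then computes one 32 x 16 tile of the [rbox1_num, rbox2_num] output, so each box is read from global memory once per tile instead of once per pair. A quick host-side check of the launch arithmetic, with hypothetical sizes:

#include <cstdio>

inline int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  // E.g. 1000 x 100 boxes => a 32 x 7 grid of 32 x 16 = 512-thread blocks,
  // each owning one tile of the IoU matrix.
  const int rbox1_num = 1000, rbox2_num = 100;
  std::printf("grid = %d x %d\n", ceil_div(rbox1_num, 32),
              ceil_div(rbox2_num, 16)); // grid = 32 x 7
  return 0;
}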

View File

@@ -0,0 +1,356 @@
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// The code is based on
// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/
#pragma once
#include <cassert>
#include <cmath>
#include <vector>
#ifdef __CUDACC__
// Designates functions callable from the host (CPU) and the device (GPU)
#define HOST_DEVICE __host__ __device__
#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
#else
#include <algorithm>
#define HOST_DEVICE
#define HOST_DEVICE_INLINE HOST_DEVICE inline
#endif
namespace {
template <typename T> struct RotatedBox { T x_ctr, y_ctr, w, h, a; };
template <typename T> struct Point {
T x, y;
HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {}
HOST_DEVICE_INLINE Point operator+(const Point &p) const {
return Point(x + p.x, y + p.y);
}
HOST_DEVICE_INLINE Point &operator+=(const Point &p) {
x += p.x;
y += p.y;
return *this;
}
HOST_DEVICE_INLINE Point operator-(const Point &p) const {
return Point(x - p.x, y - p.y);
}
HOST_DEVICE_INLINE Point operator*(const T coeff) const {
return Point(x * coeff, y * coeff);
}
};
template <typename T>
HOST_DEVICE_INLINE T dot_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.x + A.y * B.y;
}
template <typename T>
HOST_DEVICE_INLINE T cross_2d(const Point<T> &A, const Point<T> &B) {
return A.x * B.y - B.x * A.y;
}
template <typename T>
HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T> &box,
Point<T> (&pts)[4]) {
  // MODIFIED from the detectron2 original: box.a is already in radians here,
  // so the degree-to-radian conversion (box.a * M_PI / 180, i.e. a factor of
  // 0.01745329251) is not applied.
  double theta = box.a;
T cosTheta2 = (T)cos(theta) * 0.5f;
T sinTheta2 = (T)sin(theta) * 0.5f;
// y: top --> down; x: left --> right
pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
pts[2].x = 2 * box.x_ctr - pts[0].x;
pts[2].y = 2 * box.y_ctr - pts[0].y;
pts[3].x = 2 * box.x_ctr - pts[1].x;
pts[3].y = 2 * box.y_ctr - pts[1].y;
}
template <typename T>
HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
const Point<T> (&pts2)[4],
Point<T> (&intersections)[24]) {
// Line vector
// A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
Point<T> vec1[4], vec2[4];
for (int i = 0; i < 4; i++) {
vec1[i] = pts1[(i + 1) % 4] - pts1[i];
vec2[i] = pts2[(i + 1) % 4] - pts2[i];
}
// Line test - test all line combos for intersection
int num = 0; // number of intersections
for (int i = 0; i < 4; i++) {
for (int j = 0; j < 4; j++) {
// Solve for 2x2 Ax=b
T det = cross_2d<T>(vec2[j], vec1[i]);
// This takes care of parallel lines
if (fabs(det) <= 1e-14) {
continue;
}
auto vec12 = pts2[j] - pts1[i];
T t1 = cross_2d<T>(vec2[j], vec12) / det;
T t2 = cross_2d<T>(vec1[i], vec12) / det;
if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
intersections[num++] = pts1[i] + vec1[i] * t1;
}
}
}
// Check for vertices of rect1 inside rect2
{
const auto &AB = vec2[0];
const auto &DA = vec2[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
// assume ABCD is the rectangle, and P is the point to be judged
// P is inside ABCD iff. P's projection on AB lies within AB
// and P's projection on AD lies within AD
auto AP = pts1[i] - pts2[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts1[i];
}
}
}
// Reverse the check - check for vertices of rect2 inside rect1
{
const auto &AB = vec1[0];
const auto &DA = vec1[3];
auto ABdotAB = dot_2d<T>(AB, AB);
auto ADdotAD = dot_2d<T>(DA, DA);
for (int i = 0; i < 4; i++) {
auto AP = pts2[i] - pts1[0];
auto APdotAB = dot_2d<T>(AP, AB);
auto APdotAD = -dot_2d<T>(AP, DA);
if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
(APdotAD <= ADdotAD)) {
intersections[num++] = pts2[i];
}
}
}
return num;
}
template <typename T>
HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
const int &num_in, Point<T> (&q)[24],
bool shift_to_zero = false) {
assert(num_in >= 2);
// Step 1:
// Find point with minimum y
// if more than 1 points have the same minimum y,
// pick the one with the minimum x.
int t = 0;
for (int i = 1; i < num_in; i++) {
if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
t = i;
}
}
auto &start = p[t]; // starting point
// Step 2:
// Subtract starting point from every points (for sorting in the next step)
for (int i = 0; i < num_in; i++) {
q[i] = p[i] - start;
}
// Swap the starting point to position 0
auto tmp = q[0];
q[0] = q[t];
q[t] = tmp;
// Step 3:
// Sort point 1 ~ num_in according to their relative cross-product values
// (essentially sorting according to angles)
// If the angles are the same, sort according to their distance to origin
T dist[24];
for (int i = 0; i < num_in; i++) {
dist[i] = dot_2d<T>(q[i], q[i]);
}
#ifdef __CUDACC__
// CUDA version
// In the future, we can potentially use thrust
// for sorting here to improve speed (though not guaranteed)
for (int i = 1; i < num_in - 1; i++) {
for (int j = i + 1; j < num_in; j++) {
T crossProduct = cross_2d<T>(q[i], q[j]);
if ((crossProduct < -1e-6) ||
(fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
auto q_tmp = q[i];
q[i] = q[j];
q[j] = q_tmp;
auto dist_tmp = dist[i];
dist[i] = dist[j];
dist[j] = dist_tmp;
}
}
}
#else
// CPU version
std::sort(q + 1, q + num_in,
[](const Point<T> &A, const Point<T> &B) -> bool {
T temp = cross_2d<T>(A, B);
if (fabs(temp) < 1e-6) {
return dot_2d<T>(A, A) < dot_2d<T>(B, B);
} else {
return temp > 0;
}
});
#endif
// Step 4:
// Make sure there are at least 2 points (that don't overlap with each other)
// in the stack
int k; // index of the non-overlapped second point
for (k = 1; k < num_in; k++) {
if (dist[k] > 1e-8) {
break;
}
}
if (k == num_in) {
// We reach the end, which means the convex hull is just one point
q[0] = p[t];
return 1;
}
q[1] = q[k];
int m = 2; // 2 points in the stack
// Step 5:
// Finally we can start the scanning process.
// When a non-convex relationship between the 3 points is found
// (either concave shape or duplicated points),
// we pop the previous point from the stack
// until the 3-point relationship is convex again, or
// until the stack only contains two points
for (int i = k + 1; i < num_in; i++) {
while (m > 1 && cross_2d<T>(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
m--;
}
q[m++] = q[i];
}
// Step 6 (Optional):
// In general sense we need the original coordinates, so we
// need to shift the points back (reverting Step 2)
// But if we're only interested in getting the area/perimeter of the shape
// We can simply return.
if (!shift_to_zero) {
for (int i = 0; i < m; i++) {
q[i] += start;
}
}
return m;
}
template <typename T>
HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int &m) {
if (m <= 2) {
return 0;
}
T area = 0;
for (int i = 1; i < m - 1; i++) {
area += fabs(cross_2d<T>(q[i] - q[0], q[i + 1] - q[0]));
}
return area / 2.0;
}
template <typename T>
HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox<T> &box1,
const RotatedBox<T> &box2) {
// There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
// from rotated_rect_intersection_pts
Point<T> intersectPts[24], orderedPts[24];
Point<T> pts1[4];
Point<T> pts2[4];
get_rotated_vertices<T>(box1, pts1);
get_rotated_vertices<T>(box2, pts2);
int num = get_intersection_points<T>(pts1, pts2, intersectPts);
if (num <= 2) {
return 0.0;
}
// Convex Hull to order the intersection points in clockwise order and find
// the contour area.
int num_convex = convex_hull_graham<T>(intersectPts, num, orderedPts, true);
return polygon_area<T>(orderedPts, num_convex);
}
} // namespace
template <typename T>
HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw,
T const *const box2_raw) {
  // Shift both boxes so their midpoint sits at the origin; running the
  // intersection math near zero improves floating-point precision.
RotatedBox<T> box1, box2;
auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
box1.x_ctr = box1_raw[0] - center_shift_x;
box1.y_ctr = box1_raw[1] - center_shift_y;
box1.w = box1_raw[2];
box1.h = box1_raw[3];
box1.a = box1_raw[4];
box2.x_ctr = box2_raw[0] - center_shift_x;
box2.y_ctr = box2_raw[1] - center_shift_y;
box2.w = box2_raw[2];
box2.h = box2_raw[3];
box2.a = box2_raw[4];
if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) {
return 0.f;
}
const T area1 = box1.w * box1.h;
const T area2 = box2.w * box2.h;
const T intersection = rboxes_intersection<T>(box1, box2);
const T iou = intersection / (area1 + area2 - intersection);
return iou;
}
/**
Computes ceil(a / b)
*/
HOST_DEVICE inline int CeilDiv(const int a, const int b) {
return (a + b - 1) / b;
}
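
As a sanity check of the whole pipeline (get_rotated_vertices -> get_intersection_points -> convex_hull_graham -> polygon_area), take two unit squares sharing a center, one rotated 45 degrees: the intersection is a regular octagon of area 2(sqrt(2) - 1) ~ 0.8284, so IoU = (sqrt(2) - 1) / (2 - sqrt(2)) = 1/sqrt(2) ~ 0.7071. A minimal host-side test of that value (the angle is passed in radians, per the MODIFIED note in get_rotated_vertices; the test harness itself is not part of the commit):

#include <cstdio>
#include "rbox_iou_utils.h" // assumed to be on the include path

int main() {
  const double box1[5] = {0, 0, 1, 1, 0};
  const double box2[5] = {0, 0, 1, 1, 0.7853981633974483}; // pi / 4 radians
  // Expect ~0.707107, i.e. 1/sqrt(2) in exact arithmetic.
  std::printf("IoU = %.6f\n", rbox_iou_single<double>(box1, box2));
  return 0;
}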