Replace the document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions


@@ -0,0 +1,21 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import source
from . import transform
from . import reader
from .source import *
from .transform import *
from .reader import *


@@ -0,0 +1,13 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


@@ -0,0 +1,580 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import math
import random
import numpy as np
from copy import deepcopy
from typing import List, Tuple
from collections import defaultdict
from .chip_box_utils import nms, transform_chip_boxes2image_boxes
from .chip_box_utils import find_chips_to_cover_overlaped_boxes
from .chip_box_utils import transform_chip_box
from .chip_box_utils import intersection_over_box
class AnnoCropper(object):
def __init__(self,
image_target_sizes: List[int],
valid_box_ratio_ranges: List[List[float]],
chip_target_size: int,
chip_target_stride: int,
use_neg_chip: bool=False,
max_neg_num_per_im: int=8,
max_per_img: int=-1,
nms_thresh: float=0.5):
"""
Generate chips by chip_target_size and chip_target_stride.
These two parameters act like kernel_size and stride in a CNN.
Each image has a raw size; after resizing it gets its target size,
so the resizing scale = target_size / raw_size. The same scale applies
to the chips of that image.
box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size
The 'size' mentioned above is the length of the long side of the image, box or chip.
:param image_target_sizes: [2000, 1000]
:param valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]]
:param chip_target_size: 500
:param chip_target_stride: 200
"""
self.target_sizes = image_target_sizes
self.valid_box_ratio_ranges = valid_box_ratio_ranges
assert len(self.target_sizes) == len(self.valid_box_ratio_ranges)
self.scale_num = len(self.target_sizes)
self.chip_target_size = chip_target_size # is target size
self.chip_target_stride = chip_target_stride # is target stride
self.use_neg_chip = use_neg_chip
self.max_neg_num_per_im = max_neg_num_per_im
self.max_per_img = max_per_img
self.nms_thresh = nms_thresh
def crop_anno_records(self, records: List[dict]):
"""
The main logic:
# foreach record(image):
# foreach scale:
# 1 generate chips by chip size and stride for each scale
# 2 get pos chips
# - validate boxes: current scale; h,w >= 1
# - find pos chips greedily by valid gt boxes in each scale
# - for every valid gt box, find its corresponding pos chips in each scale
# 3 get neg chips
# - If proposals are given, find neg boxes among them which are not covered by pos chips
# - If neg boxes were found in the last step, find neg chips and assign the neg boxes to them, as in step 2
# 4 sample neg chips if there are too many for one image
# transform the per-image, per-scale annotations into chip (pos chip & neg chip) annotations
:param records: standard coco_record but with the extra key `proposals` (Px4), which are predicted by the stage1
model and may contain neg boxes.
:return: new_records, list of dict like
{
'im_file': 'fake_image1.jpg',
'im_id': np.array([1]), # new _global_chip_id as im_id
'h': h, # chip height
'w': w, # chip width
'is_crowd': is_crowd, # Nx1 -> Mx1
'gt_class': gt_class, # Nx1 -> Mx1
'gt_bbox': gt_bbox, # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2]
'gt_poly': gt_poly, # [None]xN -> [None]xM
'chip': [x1, y1, x2, y2] # added
}
Attention:
------------------------------>x
|
| (x1,y1)------
| | |
| | |
| | |
| | |
| | |
| ----------
| (x2,y2)
|
y
If we use [x1, y1, x2, y2] to represent boxes or chips,
(x1,y1) is the left-top point which is in the box,
but (x2,y2) is the right-bottom point which is not in the box.
So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h].
And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area.
"""
self.chip_records = []
self._global_chip_id = 1
for r in records:
self._cur_im_pos_chips = [
] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_idx is List[int]
self._cur_im_neg_chips = [] # element: (chip, neg_box_num)
for scale_i in range(self.scale_num):
self._get_current_scale_parameters(scale_i, r)
# Cx4
chips = self._create_chips(r['h'], r['w'], self._cur_scale)
# # dict: chipid->[box_id, ...]
pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips(
r['gt_bbox'], chips)
# dict: chipid->neg_box_num
neg_chip2box_num = self._get_neg_boxes_and_chips(
chips,
list(pos_chip2boxes_idx.keys()), r.get('proposals', None))
self._add_to_cur_im_chips(chips, pos_chip2boxes_idx,
neg_chip2box_num)
cur_image_records = self._trans_all_chips2annotations(r)
self.chip_records.extend(cur_image_records)
return self.chip_records
def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num):
for pos_chipid, boxes_idx in pos_chip2boxes_idx.items():
chip = np.array(chips[pos_chipid]) # copy chips slice
self._cur_im_pos_chips.append((chip, boxes_idx))
if neg_chip2box_num is None:
return
for neg_chipid, neg_box_num in neg_chip2box_num.items():
chip = np.array(chips[neg_chipid])
self._cur_im_neg_chips.append((chip, neg_box_num))
def _trans_all_chips2annotations(self, r):
gt_bbox = r['gt_bbox']
im_file = r['im_file']
is_crowd = r['is_crowd']
gt_class = r['gt_class']
# gt_poly = r['gt_poly'] # [None]xN
# remaining keys: im_id, h, w
chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox,
is_crowd, gt_class)
if not self.use_neg_chip:
return chip_records
sampled_neg_chips = self._sample_neg_chips()
neg_chip_records = self._trans_neg_chips2annotations(im_file,
sampled_neg_chips)
chip_records.extend(neg_chip_records)
return chip_records
def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd,
gt_class):
chip_records = []
for chip, boxes_idx in self._cur_im_pos_chips:
chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx,
chip)
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
rec = {
'im_file': im_file,
'im_id': np.array([self._global_chip_id]),
'h': chip_h,
'w': chip_w,
'gt_bbox': chip_bbox,
'is_crowd': is_crowd[final_boxes_idx].copy(),
'gt_class': gt_class[final_boxes_idx].copy(),
# 'gt_poly': [None] * len(final_boxes_idx),
'chip': chip
}
self._global_chip_id += 1
chip_records.append(rec)
return chip_records
def _sample_neg_chips(self):
pos_num = len(self._cur_im_pos_chips)
neg_num = len(self._cur_im_neg_chips)
sample_num = min(pos_num + 2, self.max_neg_num_per_im)
assert sample_num >= 1
if neg_num <= sample_num:
return self._cur_im_neg_chips
candidate_num = int(sample_num * 1.5)
candidate_neg_chips = sorted(
self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num]
random.shuffle(candidate_neg_chips)
sampled_neg_chips = candidate_neg_chips[:sample_num]
return sampled_neg_chips
def _trans_neg_chips2annotations(self,
im_file: str,
sampled_neg_chips: List[Tuple]):
chip_records = []
for chip, neg_box_num in sampled_neg_chips:
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
rec = {
'im_file': im_file,
'im_id': np.array([self._global_chip_id]),
'h': chip_h,
'w': chip_w,
'gt_bbox': np.zeros(
(0, 4), dtype=np.float32),
'is_crowd': np.zeros(
(0, 1), dtype=np.int32),
'gt_class': np.zeros(
(0, 1), dtype=np.int32),
# 'gt_poly': [],
'chip': chip
}
self._global_chip_id += 1
chip_records.append(rec)
return chip_records
def _get_current_scale_parameters(self, scale_i, r):
im_size = max(r['h'], r['w'])
im_target_size = self.target_sizes[scale_i]
self._cur_im_size, self._cur_im_target_size = im_size, im_target_size
self._cur_scale = self._get_current_scale(im_target_size, im_size)
self._cur_valid_ratio_range = self.valid_box_ratio_ranges[scale_i]
def _get_current_scale(self, im_target_size, im_size):
return im_target_size / im_size
def _create_chips(self, h: int, w: int, scale: float):
"""
Generate chips by chip_target_size and chip_target_stride.
These two parameters act like kernel_size and stride in a CNN.
:return: chips, Cx4, xy in raw size dimension
"""
chip_size = self.chip_target_size # omit target for simplicity
stride = self.chip_target_stride
width = int(scale * w)
height = int(scale * h)
min_chip_location_diff = 20 # in target size
assert chip_size >= stride
chip_overlap = chip_size - stride
if (width - chip_overlap
) % stride > min_chip_location_diff: # if the remainder not divisible by stride is large, keep it
w_steps = max(1, int(math.ceil((width - chip_overlap) / stride)))
else: # if the remainder not divisible by stride is small, drop it
w_steps = max(1, int(math.floor((width - chip_overlap) / stride)))
if (height - chip_overlap) % stride > min_chip_location_diff:
h_steps = max(1, int(math.ceil((height - chip_overlap) / stride)))
else:
h_steps = max(1, int(math.floor((height - chip_overlap) / stride)))
chips = list()
for j in range(h_steps):
for i in range(w_steps):
x1 = i * stride
y1 = j * stride
x2 = min(x1 + chip_size, width)
y2 = min(y1 + chip_size, height)
chips.append([x1, y1, x2, y2])
# check chip size
for item in chips:
if item[2] - item[0] > chip_size * 1.1 or item[3] - item[
1] > chip_size * 1.1:
raise ValueError(item)
chips = np.array(chips, dtype=np.float32)
raw_size_chips = chips / scale
return raw_size_chips
def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips):
valid_ratio_range = self._cur_valid_ratio_range
im_size = self._cur_im_size
scale = self._cur_scale
# Nx4 N
valid_boxes, valid_boxes_idx = self._validate_boxes(
valid_ratio_range, im_size, gt_bbox, scale)
# dict: chipid->[box_id, ...]
pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes,
valid_boxes_idx)
return pos_chip2boxes_idx
def _validate_boxes(self,
valid_ratio_range: List[float],
im_size: int,
gt_boxes: 'np.array of Nx4',
scale: float):
"""
:return: valid_boxes: Nx4, valid_boxes_idx: N
"""
ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32)
hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32)
maxs = np.maximum(ws, hs)
box_ratio = maxs / im_size
mins = np.minimum(ws, hs)
target_mins = mins * scale
low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0
high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo(
np.float32).max
valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & (
target_mins >= 2))[0]
valid_boxes = gt_boxes[valid_boxes_idx]
return valid_boxes, valid_boxes_idx
def _find_pos_chips(self,
chips: 'Cx4',
valid_boxes: 'Bx4',
valid_boxes_idx: 'B'):
"""
:return: pos_chip2boxes_idx, dict: chipid->[box_id, ...]
"""
iob = intersection_over_box(chips, valid_boxes) # overlap, CxB
iob_threshold_to_find_chips = 1.
pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes(
iob, iob_threshold_to_find_chips)
pos_chip_ids = set(pos_chip_ids)
iob_threshold_to_assign_box = 0.5
pos_chip2boxes_idx = self._assign_boxes_to_pos_chips(
iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx)
return pos_chip2boxes_idx
def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold):
return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold)
def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids,
valid_boxes_idx):
chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
pos_chip2boxes_idx = defaultdict(list)
for chip_id, box_id in zip(chip_ids, box_ids):
if chip_id not in pos_chip_ids:
continue
raw_gt_box_idx = valid_boxes_idx[box_id]
pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx)
return pos_chip2boxes_idx
def _get_neg_boxes_and_chips(self,
chips: 'Cx4',
pos_chip_ids: 'D',
proposals: 'Px4'):
"""
:param chips:
:param pos_chip_ids:
:param proposals:
:return: neg_chip2box_num, None or dict: chipid->neg_box_num
"""
if not self.use_neg_chip:
return None
# train proposals maybe None
if proposals is None or len(proposals) < 1:
return None
valid_ratio_range = self._cur_valid_ratio_range
im_size = self._cur_im_size
scale = self._cur_scale
valid_props, _ = self._validate_boxes(valid_ratio_range, im_size,
proposals, scale)
neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props)
neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes)
return neg_chip2box_num
def _find_neg_boxes(self,
chips: 'Cx4',
pos_chip_ids: 'D',
valid_props: 'Px4'):
"""
:return: neg_boxes: Nx4
"""
if len(pos_chip_ids) == 0:
return valid_props
pos_chips = chips[pos_chip_ids]
iob = intersection_over_box(pos_chips, valid_props)
overlap_per_prop = np.max(iob, axis=0)
non_overlap_props_idx = overlap_per_prop < 0.5
neg_boxes = valid_props[non_overlap_props_idx]
return neg_boxes
def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D',
neg_boxes: 'Nx4'):
"""
:return: neg_chip2box_num, dict: chipid->neg_box_num
"""
neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids)
neg_chips = chips[neg_chip_ids]
iob = intersection_over_box(neg_chips, neg_boxes)
iob_threshold_to_find_chips = 0.7
chosen_neg_chip_ids, chip_id2overlap_box_num = \
self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips)
neg_chipid2box_num = {}
for cid in chosen_neg_chip_ids:
box_num = chip_id2overlap_box_num[cid]
raw_chip_id = neg_chip_ids[cid]
neg_chipid2box_num[raw_chip_id] = box_num
return neg_chipid2box_num
def crop_infer_anno_records(self, records: List[dict]):
"""
transform image record to chips record
:param records:
:return: new_records, list of dict like
{
'im_file': 'fake_image1.jpg',
'im_id': np.array([1]), # new _global_chip_id as im_id
'h': h, # chip height
'w': w, # chip width
'chip': [x1, y1, x2, y2] # added
'ori_im_h': ori_im_h # added, origin image height
'ori_im_w': ori_im_w # added, origin image width
'scale_i': 0 # added,
}
"""
self.chip_records = []
self._global_chip_id = 1 # im_id start from 1
self._global_chip_id2img_id = {}
for r in records:
for scale_i in range(self.scale_num):
self._get_current_scale_parameters(scale_i, r)
# Cx4
chips = self._create_chips(r['h'], r['w'], self._cur_scale)
cur_img_chip_record = self._get_chips_records(r, chips, scale_i)
self.chip_records.extend(cur_img_chip_record)
return self.chip_records
def _get_chips_records(self, rec, chips, scale_i):
cur_img_chip_records = []
ori_im_h = rec["h"]
ori_im_w = rec["w"]
im_file = rec["im_file"]
ori_im_id = rec["im_id"]
for id, chip in enumerate(chips):
chip_rec = {}
x1, y1, x2, y2 = chip
chip_h = y2 - y1
chip_w = x2 - x1
chip_rec["im_file"] = im_file
chip_rec["im_id"] = self._global_chip_id
chip_rec["h"] = chip_h
chip_rec["w"] = chip_w
chip_rec["chip"] = chip
chip_rec["ori_im_h"] = ori_im_h
chip_rec["ori_im_w"] = ori_im_w
chip_rec["scale_i"] = scale_i
self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id)
self._global_chip_id += 1
cur_img_chip_records.append(chip_rec)
return cur_img_chip_records
def aggregate_chips_detections(self, results, records=None):
"""
# 1. transform chip dets to image dets
# 2. nms boxes per image;
# 3. format output results
:param results:
:param roidb:
:return:
"""
results = deepcopy(results)
records = records if records else self.chip_records
img_id2bbox = self._transform_chip2image_bboxes(results, records)
nms_img_id2bbox = self._nms_dets(img_id2bbox)
aggregate_results = self._reformat_results(nms_img_id2bbox)
return aggregate_results
def _transform_chip2image_bboxes(self, results, records):
# 1. Transform chip dets to image dets;
# 2. Filter valid range;
# 3. Reformat and Aggregate chip dets to Get scale_cls_dets
img_id2bbox = defaultdict(list)
for result in results:
bbox_locs = result['bbox']
bbox_nums = result['bbox_num']
if len(bbox_locs) == 1 and bbox_locs[0][
0] == -1: # current batch has no detections
# bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]]
# MultiClassNMS output: if there are no detected boxes for any image, lod will be set to {1} and Out contains only one value, which is -1.
continue
im_ids = result['im_id'] # replace with range(len(bbox_nums))
last_bbox_num = 0
for idx, im_id in enumerate(im_ids):
cur_bbox_len = bbox_nums[idx]
bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len]
last_bbox_num += cur_bbox_len
# box: [num_id, score, xmin, ymin, xmax, ymax]
if len(bboxes) == 0: # current image has no detections
continue
chip_rec = records[int(im_id) -
1] # im_id starts from 1, type is np.int64
image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"])
bboxes = transform_chip_boxes2image_boxes(
bboxes, chip_rec["chip"], chip_rec["ori_im_h"],
chip_rec["ori_im_w"])
scale_i = chip_rec["scale_i"]
cur_scale = self._get_current_scale(self.target_sizes[scale_i],
image_size)
_, valid_boxes_idx = self._validate_boxes(
self.valid_box_ratio_ranges[scale_i], image_size,
bboxes[:, 2:], cur_scale)
ori_img_id = self._global_chip_id2img_id[int(im_id)]
img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx])
return img_id2bbox
def _nms_dets(self, img_id2bbox):
# 1. NMS on each image-class
# 2. Limit number of detections to MAX_PER_IMAGE if requested
max_per_img = self.max_per_img
nms_thresh = self.nms_thresh
for img_id in img_id2bbox:
box = img_id2bbox[
img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
box = np.concatenate(box, axis=0)
nms_dets = nms(box, nms_thresh)
if max_per_img > 0:
if len(nms_dets) > max_per_img:
keep = np.argsort(-nms_dets[:, 1])[:max_per_img]
nms_dets = nms_dets[keep]
img_id2bbox[img_id] = nms_dets
return img_id2bbox
def _reformat_results(self, img_id2bbox):
"""reformat results"""
im_ids = img_id2bbox.keys()
results = []
for img_id in im_ids: # output by original im_id order
if len(img_id2bbox[img_id]) == 0:
bbox = np.array(
[[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections
bbox_num = np.array([0])
else:
# np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2]
bbox = img_id2bbox[img_id]
bbox_num = np.array([len(bbox)])
res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num)
results.append(res)
return results
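Below is a minimal usage sketch of AnnoCropper with hypothetical record values; the import path is assumed rather than taken from this commit, and the aggregation call is shown only as a comment.

import numpy as np
# from ppdet.data.crop_utils.annotation_cropper import AnnoCropper  # assumed import path

cropper = AnnoCropper(
    image_target_sizes=[2000, 1000],
    valid_box_ratio_ranges=[[-1, 0.1], [0.08, -1]],
    chip_target_size=500,
    chip_target_stride=200)

# one fake coco-style record; 'proposals' is only needed when use_neg_chip=True
records = [{
    'im_file': 'fake_image1.jpg',
    'im_id': np.array([1]),
    'h': 1500,
    'w': 2000,
    'gt_bbox': np.array([[100., 100., 180., 160.]], dtype=np.float32),
    'gt_class': np.array([[0]], dtype=np.int32),
    'is_crowd': np.array([[0]], dtype=np.int32),
}]
chip_records = cropper.crop_anno_records(records)  # one record per selected chip
# after running inference on the chips: cropper.aggregate_chips_detections(chip_results)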


@@ -0,0 +1,170 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
def bbox_area(boxes):
return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
def intersection_over_box(chips, boxes):
"""
intersection area over box area
:param chips: C
:param boxes: B
:return: iob, CxB
"""
M = chips.shape[0]
N = boxes.shape[0]
if M * N == 0:
return np.zeros([M, N], dtype='float32')
box_area = bbox_area(boxes) # B
inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:],
boxes[:, 2:]) # CxBX2
inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2],
boxes[:, :2]) # CxBx2
inter_wh = inter_x2y2 - inter_x1y1
inter_wh = np.clip(inter_wh, a_min=0, a_max=None)
inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB
iob = inter_area / np.expand_dims(box_area, 0)
return iob
def clip_boxes(boxes, im_shape):
"""
Clip boxes to image boundaries.
:param boxes: [N, 4]
:param im_shape: tuple of 2, [h, w]
:return: [N, 4]
"""
# x1 >= 0
boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1)
# y1 >= 0
boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1)
# x2 < im_shape[1]
boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1])
# y2 < im_shape[0]
boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0])
return boxes
def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'):
boxes_idx = np.array(boxes_idx)
cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4
x1, y1, x2, y2 = chip
cur_gt_bbox[:, 0] -= x1
cur_gt_bbox[:, 1] -= y1
cur_gt_bbox[:, 2] -= x1
cur_gt_bbox[:, 3] -= y1
h = y2 - y1
w = x2 - x1
cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w))
ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32)
hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32)
valid_idx = (ws >= 2) & (hs >= 2)
return cur_gt_bbox[valid_idx], boxes_idx[valid_idx]
def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold):
chip_ids, box_ids = np.nonzero(iob >= overlap_threshold)
chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array
chip_id2overlap_box_num = np.pad(
chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)),
constant_values=0)
chosen_chip_ids = []
while len(box_ids) > 0:
value_counts = np.bincount(chip_ids) # 1d array
max_count_chip_id = np.argmax(value_counts)
assert max_count_chip_id not in chosen_chip_ids
chosen_chip_ids.append(max_count_chip_id)
box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id]
ids_not_in_cur_boxes_mask = np.logical_not(
np.isin(box_ids, box_ids_in_cur_chip))
chip_ids = chip_ids[ids_not_in_cur_boxes_mask]
box_ids = box_ids[ids_not_in_cur_boxes_mask]
return chosen_chip_ids, chip_id2overlap_box_num
def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w):
chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1]))
xmin, ymin, _, _ = chip
# Transform to origin image loc
chip_boxes[:, 2] += xmin
chip_boxes[:, 4] += xmin
chip_boxes[:, 3] += ymin
chip_boxes[:, 5] += ymin
chip_boxes = clip_boxes(chip_boxes, (img_h, img_w))
return chip_boxes
def nms(dets, thresh):
"""Apply classic DPM-style greedy NMS."""
if dets.shape[0] == 0:
return dets[[], :]
scores = dets[:, 1]
x1 = dets[:, 2]
y1 = dets[:, 3]
x2 = dets[:, 4]
y2 = dets[:, 5]
areas = (x2 - x1 + 1) * (y2 - y1 + 1)
order = scores.argsort()[::-1]
ndets = dets.shape[0]
suppressed = np.zeros((ndets), dtype=np.int32)
# nominal indices
# _i, _j
# sorted indices
# i, j
# temp variables for box i's (the box currently under consideration)
# ix1, iy1, ix2, iy2, iarea
# variables for computing overlap with box j (lower scoring box)
# xx1, yy1, xx2, yy2
# w, h
# inter, ovr
for _i in range(ndets):
i = order[_i]
if suppressed[i] == 1:
continue
ix1 = x1[i]
iy1 = y1[i]
ix2 = x2[i]
iy2 = y2[i]
iarea = areas[i]
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
xx1 = max(ix1, x1[j])
yy1 = max(iy1, y1[j])
xx2 = min(ix2, x2[j])
yy2 = min(iy2, y2[j])
w = max(0.0, xx2 - xx1 + 1)
h = max(0.0, yy2 - yy1 + 1)
inter = w * h
ovr = inter / (iarea + areas[j] - inter)
if ovr >= thresh:
suppressed[j] = 1
keep = np.where(suppressed == 0)[0]
dets = dets[keep, :]
return dets
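A small worked example of the two main helpers above, on toy values; the dets rows follow the [label, score, x1, y1, x2, y2] layout that nms expects.

import numpy as np

chips = np.array([[0., 0., 100., 100.]], dtype=np.float32)    # Cx4, here C=1
boxes = np.array([[50., 50., 150., 150.]], dtype=np.float32)  # Bx4, here B=1
iob = intersection_over_box(chips, boxes)
# intersection 50x50=2500 over box area 100x100=10000 -> iob[0, 0] == 0.25

dets = np.array([
    [0, 0.9, 10, 10, 60, 60],      # kept (highest score)
    [0, 0.8, 12, 12, 62, 62],      # suppressed: IoU with the first box > 0.5
    [0, 0.7, 200, 200, 260, 260],  # kept: no overlap with the first box
], dtype=np.float32)
kept = nms(dets, 0.5)  # rows 0 and 2 survive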


@@ -0,0 +1,130 @@
import math
import numpy as np
from imgaug.augmentables.lines import LineString
from scipy.interpolate import InterpolatedUnivariateSpline
def lane_to_linestrings(lanes):
lines = []
for lane in lanes:
lines.append(LineString(lane))
return lines
def linestrings_to_lanes(lines):
lanes = []
for line in lines:
lanes.append(line.coords)
return lanes
def sample_lane(points, sample_ys, img_w):
# this function expects the points to be sorted
points = np.array(points)
if not np.all(points[1:, 1] < points[:-1, 1]):
raise Exception('Annotation points have to be sorted')
x, y = points[:, 0], points[:, 1]
# interpolate points inside domain
assert len(points) > 1
interp = InterpolatedUnivariateSpline(
y[::-1], x[::-1], k=min(3, len(points) - 1))
domain_min_y = y.min()
domain_max_y = y.max()
sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & (
sample_ys <= domain_max_y)]
assert len(sample_ys_inside_domain) > 0
interp_xs = interp(sample_ys_inside_domain)
# extrapolate lane to the bottom of the image with a straight line using the 2 points closest to the bottom
two_closest_points = points[:2]
extrap = np.polyfit(
two_closest_points[:, 1], two_closest_points[:, 0], deg=1)
extrap_ys = sample_ys[sample_ys > domain_max_y]
extrap_xs = np.polyval(extrap, extrap_ys)
all_xs = np.hstack((extrap_xs, interp_xs))
# separate between inside and outside points
inside_mask = (all_xs >= 0) & (all_xs < img_w)
xs_inside_image = all_xs[inside_mask]
xs_outside_image = all_xs[~inside_mask]
return xs_outside_image, xs_inside_image
def filter_lane(lane):
assert lane[-1][1] <= lane[0][1]
filtered_lane = []
used = set()
for p in lane:
if p[1] not in used:
filtered_lane.append(p)
used.add(p[1])
return filtered_lane
def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys,
n_strips, strip_size, anno):
old_lanes = anno['lanes']
# remove lanes with fewer than 2 points
old_lanes = filter(lambda x: len(x) > 1, old_lanes)
# sort lane points by Y (bottom to top of the image)
old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes]
# remove points with same Y (keep first occurrence)
old_lanes = [filter_lane(lane) for lane in old_lanes]
# normalize the annotation coordinates
old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)]
for x, y in lane] for lane in old_lanes]
# create transformed annotations
lanes = np.ones(
(max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32
) * -1e5 # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates
lanes_endpoints = np.ones((max_lanes, 2))
# lanes are invalid by default
lanes[:, 0] = 1
lanes[:, 1] = 0
for lane_idx, lane in enumerate(old_lanes):
if lane_idx >= max_lanes:
break
try:
xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys,
img_w)
except AssertionError:
continue
if len(xs_inside_image) <= 1:
continue
all_xs = np.hstack((xs_outside_image, xs_inside_image))
lanes[lane_idx, 0] = 0
lanes[lane_idx, 1] = 1
lanes[lane_idx, 2] = len(xs_outside_image) / n_strips
lanes[lane_idx, 3] = xs_inside_image[0]
thetas = []
for i in range(1, len(xs_inside_image)):
theta = math.atan(
i * strip_size /
(xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi
theta = theta if theta > 0 else 1 - abs(theta)
thetas.append(theta)
theta_far = sum(thetas) / len(thetas)
# lanes[lane_idx,
# 4] = (theta_closest + theta_far) / 2 # averaged angle
lanes[lane_idx, 4] = theta_far
lanes[lane_idx, 5] = len(xs_inside_image)
lanes[lane_idx, 6:6 + len(all_xs)] = all_xs
lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips
lanes_endpoints[lane_idx, 1] = xs_inside_image[-1]
new_anno = {
'label': lanes,
'old_anno': anno,
'lane_endpoints': lanes_endpoints
}
return new_anno
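A short sketch of sample_lane on a toy lane, with hypothetical coordinates; the points must be ordered bottom-to-top, i.e. by strictly decreasing y.

import numpy as np

points = [(100., 300.), (120., 250.), (140., 200.)]  # one lane, y decreasing
sample_ys = np.arange(350, 190, -10)                  # rows at which x is sampled
xs_outside, xs_inside = sample_lane(points, sample_ys, 800)
# the sampled xs combine a straight-line extrapolation below y=300 with the
# spline interpolation on [200, 300]; xs_inside are those with 0 <= x < 800
# and xs_outside are the rest (empty for these values).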


@@ -0,0 +1,615 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import traceback
import six
import sys
if sys.version_info >= (3, 0):
pass
else:
pass
import numpy as np
import paddle
import paddle.nn.functional as F
from copy import deepcopy
from paddle.io import DataLoader, DistributedBatchSampler
from .utils import default_collate_fn
from ppdet.core.workspace import register
from . import transform
from .shm_utils import _get_shared_memory_size_in_M
from ppdet.utils.logger import setup_logger
logger = setup_logger('reader')
MAIN_PID = os.getpid()
class Compose(object):
def __init__(self, transforms, num_classes=80):
self.transforms = transforms
self.transforms_cls = []
for t in self.transforms:
for k, v in t.items():
op_cls = getattr(transform, k)
f = op_cls(**v)
if hasattr(f, 'num_classes'):
f.num_classes = num_classes
self.transforms_cls.append(f)
def __call__(self, data):
for f in self.transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map sample transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
return data
class BatchCompose(Compose):
def __init__(self, transforms, num_classes=80, collate_batch=True):
super(BatchCompose, self).__init__(transforms, num_classes)
self.collate_batch = collate_batch
def __call__(self, data):
for f in self.transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map batch transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
# remove keys which are not needed by the model
extra_key = ['h', 'w', 'flipped']
for k in extra_key:
for sample in data:
if k in sample:
sample.pop(k)
# batch data; if a user-defined batch
# function is needed, use it here
if self.collate_batch:
batch_data = default_collate_fn(data)
else:
batch_data = {}
for k in data[0].keys():
tmp_data = []
for i in range(len(data)):
tmp_data.append(data[i][k])
if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
tmp_data = np.stack(tmp_data, axis=0)
batch_data[k] = tmp_data
return batch_data
class BaseDataLoader(object):
"""
Base DataLoader implementation for detection models
Args:
sample_transforms (list): a list of transforms to perform
on each sample
batch_transforms (list): a list of transforms to perform
on batch
batch_size (int): batch size for batch collating, default 1.
shuffle (bool): whether to shuffle samples
drop_last (bool): whether to drop the last incomplete batch,
default False
num_classes (int): class number of dataset, default 80
collate_batch (bool): whether to collate batch in dataloader.
If set to True, the samples will be collated into a batch according
to the batch size. Otherwise, the ground-truth will not be collated,
which is useful when the number of ground-truths differs between
samples.
use_shared_memory (bool): whether to use shared memory to
accelerate data loading; enable this only if you
are sure that the shared memory size of your OS
is larger than the memory cost of the model's input data.
Note that shared memory will be automatically
disabled if the shared memory of the OS is less than
1G, which is not enough for detection models.
Default False.
"""
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
collate_batch=True,
use_shared_memory=False,
**kwargs):
# sample transform
self._sample_transforms = Compose(
sample_transforms, num_classes=num_classes)
# batch transform
self._batch_transforms = BatchCompose(batch_transforms, num_classes,
collate_batch)
self.batch_size = batch_size
self.shuffle = shuffle
self.drop_last = drop_last
self.use_shared_memory = use_shared_memory
self.kwargs = kwargs
def __call__(self,
dataset,
worker_num,
batch_sampler=None,
return_list=False):
self.dataset = dataset
self.dataset.check_or_download_dataset()
self.dataset.parse_dataset()
# get data
self.dataset.set_transform(self._sample_transforms)
# set kwargs
self.dataset.set_kwargs(**self.kwargs)
# batch sampler
if batch_sampler is None:
self._batch_sampler = DistributedBatchSampler(
self.dataset,
batch_size=self.batch_size,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler = batch_sampler
# DataLoader does not start sub-processes on Windows and Mac
# systems, so shared memory is not needed there
use_shared_memory = self.use_shared_memory and \
sys.platform not in ['win32', 'darwin']
# check whether shared memory size is bigger than 1G(1024M)
if use_shared_memory:
shm_size = _get_shared_memory_size_in_M()
if shm_size is not None and shm_size < 1024.:
logger.warning("Shared memory size is less than 1G, "
"disable shared_memory in DataLoader")
use_shared_memory = False
self.dataloader = DataLoader(
dataset=self.dataset,
batch_sampler=self._batch_sampler,
collate_fn=self._batch_transforms,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.loader = iter(self.dataloader)
return self
def __len__(self):
return len(self._batch_sampler)
def __iter__(self):
return self
def __next__(self):
try:
return next(self.loader)
except StopIteration:
self.loader = iter(self.dataloader)
six.reraise(*sys.exc_info())
def next(self):
# python2 compatibility
return self.__next__()
@register
class TrainReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
**kwargs):
super(TrainReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, collate_batch, **kwargs)
@register
class EvalReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
**kwargs):
super(EvalReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class TestReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=80,
**kwargs):
super(TestReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class EvalMOTReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=1,
**kwargs):
super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
@register
class TestMOTReader(BaseDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
batch_transforms=[],
batch_size=1,
shuffle=False,
drop_last=False,
num_classes=1,
**kwargs):
super(TestMOTReader, self).__init__(sample_transforms, batch_transforms,
batch_size, shuffle, drop_last,
num_classes, **kwargs)
# For Semi-Supervised Object Detection (SSOD)
class Compose_SSOD(object):
def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80):
self.base_transforms = base_transforms
self.base_transforms_cls = []
for t in self.base_transforms:
for k, v in t.items():
op_cls = getattr(transform, k)
f = op_cls(**v)
if hasattr(f, 'num_classes'):
f.num_classes = num_classes
self.base_transforms_cls.append(f)
self.weak_augs = weak_aug
self.weak_augs_cls = []
for t in self.weak_augs:
for k, v in t.items():
op_cls = getattr(transform, k)
f = op_cls(**v)
if hasattr(f, 'num_classes'):
f.num_classes = num_classes
self.weak_augs_cls.append(f)
self.strong_augs = strong_aug
self.strong_augs_cls = []
for t in self.strong_augs:
for k, v in t.items():
op_cls = getattr(transform, k)
f = op_cls(**v)
if hasattr(f, 'num_classes'):
f.num_classes = num_classes
self.strong_augs_cls.append(f)
def __call__(self, data):
for f in self.base_transforms_cls:
try:
data = f(data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map sample transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
weak_data = deepcopy(data)
strong_data = deepcopy(data)
for f in self.weak_augs_cls:
try:
weak_data = f(weak_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map weak aug [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
for f in self.strong_augs_cls:
try:
strong_data = f(strong_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map strong aug [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
weak_data['strong_aug'] = strong_data
return weak_data
class BatchCompose_SSOD(Compose):
def __init__(self, transforms, num_classes=80, collate_batch=True):
super(BatchCompose_SSOD, self).__init__(transforms, num_classes)
self.collate_batch = collate_batch
def __call__(self, data):
# split strong_data from data(weak_data)
strong_data = []
for sample in data:
strong_data.append(sample['strong_aug'])
sample.pop('strong_aug')
for f in self.transforms_cls:
try:
data = f(data)
if 'BatchRandomResizeForSSOD' in f._id:
strong_data = f(strong_data, data[1])[0]
data = data[0]
else:
strong_data = f(strong_data)
except Exception as e:
stack_info = traceback.format_exc()
logger.warning("fail to map batch transform [{}] "
"with error: {} and stack:\n{}".format(
f, e, str(stack_info)))
raise e
# remove keys which are not needed by the model
extra_key = ['h', 'w', 'flipped']
for k in extra_key:
for sample in data:
if k in sample:
sample.pop(k)
for sample in strong_data:
if k in sample:
sample.pop(k)
# batch data; if a user-defined batch
# function is needed, use it here
if self.collate_batch:
batch_data = default_collate_fn(data)
strong_batch_data = default_collate_fn(strong_data)
return batch_data, strong_batch_data
else:
batch_data = {}
for k in data[0].keys():
tmp_data = []
for i in range(len(data)):
tmp_data.append(data[i][k])
if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
tmp_data = np.stack(tmp_data, axis=0)
batch_data[k] = tmp_data
strong_batch_data = {}
for k in strong_data[0].keys():
tmp_data = []
for i in range(len(strong_data)):
tmp_data.append(strong_data[i][k])
if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k:
tmp_data = np.stack(tmp_data, axis=0)
strong_batch_data[k] = tmp_data
return batch_data, strong_batch_data
class CombineSSODLoader(object):
def __init__(self, label_loader, unlabel_loader):
self.label_loader = label_loader
self.unlabel_loader = unlabel_loader
def __iter__(self):
while True:
try:
label_samples = next(self.label_loader_iter)
except:
self.label_loader_iter = iter(self.label_loader)
label_samples = next(self.label_loader_iter)
try:
unlabel_samples = next(self.unlabel_loader_iter)
except:
self.unlabel_loader_iter = iter(self.unlabel_loader)
unlabel_samples = next(self.unlabel_loader_iter)
yield (
label_samples[0], # sup weak
label_samples[1], # sup strong
unlabel_samples[0], # unsup weak
unlabel_samples[1] # unsup strong
)
def __call__(self):
return self.__iter__()
class BaseSemiDataLoader(object):
def __init__(self,
sample_transforms=[],
weak_aug=[],
strong_aug=[],
sup_batch_transforms=[],
unsup_batch_transforms=[],
sup_batch_size=1,
unsup_batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
use_shared_memory=False,
**kwargs):
# sup transforms
self._sample_transforms_label = Compose_SSOD(
sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
self._batch_transforms_label = BatchCompose_SSOD(
sup_batch_transforms, num_classes, collate_batch)
self.batch_size_label = sup_batch_size
# unsup transforms
self._sample_transforms_unlabel = Compose_SSOD(
sample_transforms, weak_aug, strong_aug, num_classes=num_classes)
self._batch_transforms_unlabel = BatchCompose_SSOD(
unsup_batch_transforms, num_classes, collate_batch)
self.batch_size_unlabel = unsup_batch_size
# common
self.shuffle = shuffle
self.drop_last = drop_last
self.use_shared_memory = use_shared_memory
self.kwargs = kwargs
def __call__(self,
dataset_label,
dataset_unlabel,
worker_num,
batch_sampler_label=None,
batch_sampler_unlabel=None,
return_list=False):
# sup dataset
self.dataset_label = dataset_label
self.dataset_label.check_or_download_dataset()
self.dataset_label.parse_dataset()
self.dataset_label.set_transform(self._sample_transforms_label)
self.dataset_label.set_kwargs(**self.kwargs)
if batch_sampler_label is None:
self._batch_sampler_label = DistributedBatchSampler(
self.dataset_label,
batch_size=self.batch_size_label,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler_label = batch_sampler_label
# unsup dataset
self.dataset_unlabel = dataset_unlabel
self.dataset_unlabel.length = self.dataset_label.__len__()
self.dataset_unlabel.check_or_download_dataset()
self.dataset_unlabel.parse_dataset()
self.dataset_unlabel.set_transform(self._sample_transforms_unlabel)
self.dataset_unlabel.set_kwargs(**self.kwargs)
if batch_sampler_unlabel is None:
self._batch_sampler_unlabel = DistributedBatchSampler(
self.dataset_unlabel,
batch_size=self.batch_size_unlabel,
shuffle=self.shuffle,
drop_last=self.drop_last)
else:
self._batch_sampler_unlabel = batch_sampler_unlabel
# DataLoader does not start sub-processes on Windows and Mac
# systems, so shared memory is not needed there
use_shared_memory = self.use_shared_memory and \
sys.platform not in ['win32', 'darwin']
# check whether shared memory size is bigger than 1G(1024M)
if use_shared_memory:
shm_size = _get_shared_memory_size_in_M()
if shm_size is not None and shm_size < 1024.:
logger.warning("Shared memory size is less than 1G, "
"disable shared_memory in DataLoader")
use_shared_memory = False
self.dataloader_label = DataLoader(
dataset=self.dataset_label,
batch_sampler=self._batch_sampler_label,
collate_fn=self._batch_transforms_label,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.dataloader_unlabel = DataLoader(
dataset=self.dataset_unlabel,
batch_sampler=self._batch_sampler_unlabel,
collate_fn=self._batch_transforms_unlabel,
num_workers=worker_num,
return_list=return_list,
use_shared_memory=use_shared_memory)
self.dataloader = CombineSSODLoader(self.dataloader_label,
self.dataloader_unlabel)
self.loader = iter(self.dataloader)
return self
def __len__(self):
return len(self._batch_sampler_label)
def __iter__(self):
return self
def __next__(self):
return next(self.loader)
def next(self):
# python2 compatibility
return self.__next__()
@register
class SemiTrainReader(BaseSemiDataLoader):
__shared__ = ['num_classes']
def __init__(self,
sample_transforms=[],
weak_aug=[],
strong_aug=[],
sup_batch_transforms=[],
unsup_batch_transforms=[],
sup_batch_size=1,
unsup_batch_size=1,
shuffle=True,
drop_last=True,
num_classes=80,
collate_batch=True,
**kwargs):
super(SemiTrainReader, self).__init__(
sample_transforms, weak_aug, strong_aug, sup_batch_transforms,
unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle,
drop_last, num_classes, collate_batch, **kwargs)
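For reference, a minimal sketch of the transform-config structure that Compose and BatchCompose expect: a list of single-key dicts mapping a registered op name in ppdet.data.transform to its constructor kwargs. The op names and kwargs below are illustrative assumptions, not taken from this commit.

# illustrative only: assumes 'Decode' and 'Resize' are ops registered
# in ppdet.data.transform with these kwargs
sample_transforms = [
    {'Decode': {}},
    {'Resize': {'target_size': [640, 640], 'keep_ratio': True}},
]
# Compose resolves each key via getattr(transform, name), instantiates the
# op with its kwargs, and propagates num_classes when the op defines it:
# compose = Compose(sample_transforms, num_classes=80)
# sample = compose({'im_file': 'demo.jpg'})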


@@ -0,0 +1,70 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
SIZE_UNIT = ['K', 'M', 'G', 'T']
SHM_QUERY_CMD = 'df -h'
SHM_KEY = 'shm'
SHM_DEFAULT_MOUNT = '/dev/shm'
# [ shared memory size check ]
# In detection models, image/target data occupies a lot of memory, and
# will occupy lots of shared memory in multi-process DataLoader, we use
# following code to get shared memory size and perform a size check to
# disable shared memory use if shared memory size is not enough.
# Shared memory getting process as follows:
# 1. use `df -h` get all mount info
# 2. pick up spaces whose mount info contains 'shm'
# 3. if there is only one 'shm' space, return its size
# 4. if there are multiple 'shm' spaces, try to find the default mount
#    directory '/dev/shm' of Linux-like systems, otherwise return the
#    biggest space size.
def _parse_size_in_M(size_str):
if size_str[-1] == 'B':
num, unit = size_str[:-2], size_str[-2]
else:
num, unit = size_str[:-1], size_str[-1]
assert unit in SIZE_UNIT, \
"unknown shm size unit {}".format(unit)
return float(num) * \
(1024 ** (SIZE_UNIT.index(unit) - 1))
def _get_shared_memory_size_in_M():
try:
df_infos = os.popen(SHM_QUERY_CMD).readlines()
except:
return None
else:
shm_infos = []
for df_info in df_infos:
info = df_info.strip()
if info.find(SHM_KEY) >= 0:
shm_infos.append(info.split())
if len(shm_infos) == 0:
return None
elif len(shm_infos) == 1:
return _parse_size_in_M(shm_infos[0][3])
else:
default_mount_infos = [
si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT
]
if default_mount_infos:
return _parse_size_in_M(default_mount_infos[0][3])
else:
return max([_parse_size_in_M(si[3]) for si in shm_infos])
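Worked values for _parse_size_in_M, given the `df -h` style size strings it is meant to parse:

_parse_size_in_M('500M')  # -> 500.0
_parse_size_in_M('64G')   # -> 65536.0 (64 * 1024 M)
_parse_size_in_M('7.8G')  # -> 7987.2
# _get_shared_memory_size_in_M() returns None when `df -h` fails or no 'shm'
# mount is found; otherwise it reports /dev/shm (or the largest 'shm' mount)
# in megabytes.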


@@ -0,0 +1,33 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import coco
from . import voc
from . import widerface
from . import category
from . import keypoint_coco
from . import mot
from . import sniper_coco
from . import culane
from .coco import *
from .voc import *
from .widerface import *
from .category import *
from .keypoint_coco import *
from .mot import *
from .sniper_coco import SniperCOCODataSet
from .dataset import ImageFolder
from .pose3d_cmb import *
from .culane import *


@@ -0,0 +1,942 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from ppdet.data.source.voc import pascalvoc_label
from ppdet.data.source.widerface import widerface_label
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = ['get_categories']
def get_categories(metric_type, anno_file=None, arch=None):
"""
Get class id to category id map and category id
to category name map from annotation file.
Args:
metric_type (str): metric type, currently supports 'coco', 'voc', 'oid'
and 'widerface'.
anno_file (str): annotation file path
"""
if arch == 'keypoint_arch':
return (None, {'id': 'keypoint'})
if anno_file is None or (not os.path.isfile(anno_file)):
logger.warning(
"anno_file '{}' is None or not set or not exist, "
"please recheck TrainDataset/EvalDataset/TestDataset.anno_path, "
"otherwise the default categories will be used by metric_type.".
format(anno_file))
if metric_type.lower() == 'coco' or metric_type.lower(
) == 'rbox' or metric_type.lower() == 'snipercoco':
if anno_file and os.path.isfile(anno_file):
if anno_file.endswith('json'):
# lazy import pycocotools here
from pycocotools.coco import COCO
coco = COCO(anno_file)
cats = coco.loadCats(coco.getCatIds())
clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)}
catid2name = {cat['id']: cat['name'] for cat in cats}
elif anno_file.endswith('txt'):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background': cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
else:
raise ValueError("anno_file {} should be json or txt.".format(
anno_file))
return clsid2catid, catid2name
# anno file does not exist, load default categories of COCO17
else:
if metric_type.lower() == 'rbox':
logger.warning(
"metric_type: {}, load default categories of DOTA.".format(
metric_type))
return _dota_category()
logger.warning("metric_type: {}, load default categories of COCO.".
format(metric_type))
return _coco17_category()
elif metric_type.lower() == 'voc':
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
# anno file does not exist, load default categories of
# VOC (all 20 categories)
else:
logger.warning("metric_type: {}, load default categories of VOC.".
format(metric_type))
return _vocall_category()
elif metric_type.lower() == 'oid':
if anno_file and os.path.isfile(anno_file):
logger.warning("only default categories support for OID19")
return _oid19_category()
elif metric_type.lower() == 'widerface':
return _widerface_category()
elif metric_type.lower() in [
'keypointtopdowncocoeval', 'keypointtopdownmpiieval',
'keypointtopdowncocowholebadyhandeval'
]:
return (None, {'id': 'keypoint'})
elif metric_type.lower() == 'pose3deval':
return (None, {'id': 'pose3d'})
elif metric_type.lower() in ['mot', 'motdet', 'reid']:
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
# anno file does not exist, load default category 'pedestrian'.
else:
logger.warning(
"metric_type: {}, load default categories of pedestrian MOT.".
format(metric_type))
return _mot_category(category='pedestrian')
elif metric_type.lower() in ['kitti', 'bdd100kmot']:
return _mot_category(category='vehicle')
elif metric_type.lower() in ['mcmot']:
if anno_file and os.path.isfile(anno_file):
cats = []
with open(anno_file) as f:
for line in f.readlines():
cats.append(line.strip())
if cats[0] == 'background':
cats = cats[1:]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
# anno file does not exist, load default categories of VisDrone (all 10 categories)
else:
logger.warning(
"metric_type: {}, load default categories of VisDrone.".format(
metric_type))
return _visdrone_category()
else:
raise ValueError("unknown metric type {}".format(metric_type))
def _mot_category(category='pedestrian'):
"""
Get class id to category id map and category id
to category name map of mot dataset
"""
label_map = {category: 0}
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _coco17_category():
"""
Get class id to category id map and category id
to category name map of COCO2017 dataset
"""
clsid2catid = {
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
11: 11,
12: 13,
13: 14,
14: 15,
15: 16,
16: 17,
17: 18,
18: 19,
19: 20,
20: 21,
21: 22,
22: 23,
23: 24,
24: 25,
25: 27,
26: 28,
27: 31,
28: 32,
29: 33,
30: 34,
31: 35,
32: 36,
33: 37,
34: 38,
35: 39,
36: 40,
37: 41,
38: 42,
39: 43,
40: 44,
41: 46,
42: 47,
43: 48,
44: 49,
45: 50,
46: 51,
47: 52,
48: 53,
49: 54,
50: 55,
51: 56,
52: 57,
53: 58,
54: 59,
55: 60,
56: 61,
57: 62,
58: 63,
59: 64,
60: 65,
61: 67,
62: 70,
63: 72,
64: 73,
65: 74,
66: 75,
67: 76,
68: 77,
69: 78,
70: 79,
71: 80,
72: 81,
73: 82,
74: 84,
75: 85,
76: 86,
77: 87,
78: 88,
79: 89,
80: 90
}
catid2name = {
0: 'background',
1: 'person',
2: 'bicycle',
3: 'car',
4: 'motorcycle',
5: 'airplane',
6: 'bus',
7: 'train',
8: 'truck',
9: 'boat',
10: 'traffic light',
11: 'fire hydrant',
13: 'stop sign',
14: 'parking meter',
15: 'bench',
16: 'bird',
17: 'cat',
18: 'dog',
19: 'horse',
20: 'sheep',
21: 'cow',
22: 'elephant',
23: 'bear',
24: 'zebra',
25: 'giraffe',
27: 'backpack',
28: 'umbrella',
31: 'handbag',
32: 'tie',
33: 'suitcase',
34: 'frisbee',
35: 'skis',
36: 'snowboard',
37: 'sports ball',
38: 'kite',
39: 'baseball bat',
40: 'baseball glove',
41: 'skateboard',
42: 'surfboard',
43: 'tennis racket',
44: 'bottle',
46: 'wine glass',
47: 'cup',
48: 'fork',
49: 'knife',
50: 'spoon',
51: 'bowl',
52: 'banana',
53: 'apple',
54: 'sandwich',
55: 'orange',
56: 'broccoli',
57: 'carrot',
58: 'hot dog',
59: 'pizza',
60: 'donut',
61: 'cake',
62: 'chair',
63: 'couch',
64: 'potted plant',
65: 'bed',
67: 'dining table',
70: 'toilet',
72: 'tv',
73: 'laptop',
74: 'mouse',
75: 'remote',
76: 'keyboard',
77: 'cell phone',
78: 'microwave',
79: 'oven',
80: 'toaster',
81: 'sink',
82: 'refrigerator',
84: 'book',
85: 'clock',
86: 'vase',
87: 'scissors',
88: 'teddy bear',
89: 'hair drier',
90: 'toothbrush'
}
clsid2catid = {k - 1: v for k, v in clsid2catid.items()}
catid2name.pop(0)
return clsid2catid, catid2name
def _dota_category():
"""
Get class id to category id map and category id
to category name map of dota dataset
"""
catid2name = {
0: 'background',
1: 'plane',
2: 'baseball-diamond',
3: 'bridge',
4: 'ground-track-field',
5: 'small-vehicle',
6: 'large-vehicle',
7: 'ship',
8: 'tennis-court',
9: 'basketball-court',
10: 'storage-tank',
11: 'soccer-ball-field',
12: 'roundabout',
13: 'harbor',
14: 'swimming-pool',
15: 'helicopter'
}
catid2name.pop(0)
clsid2catid = {i: i + 1 for i in range(len(catid2name))}
return clsid2catid, catid2name
def _vocall_category():
"""
Get class id to category id map and category id
to category name map of mixup voc dataset
"""
label_map = pascalvoc_label()
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _widerface_category():
label_map = widerface_label()
label_map = sorted(label_map.items(), key=lambda x: x[1])
cats = [l[0] for l in label_map]
clsid2catid = {i: i for i in range(len(cats))}
catid2name = {i: name for i, name in enumerate(cats)}
return clsid2catid, catid2name
def _oid19_category():
clsid2catid = {k: k + 1 for k in range(500)}
catid2name = {
0: "background",
1: "Infant bed",
2: "Rose",
3: "Flag",
4: "Flashlight",
5: "Sea turtle",
6: "Camera",
7: "Animal",
8: "Glove",
9: "Crocodile",
10: "Cattle",
11: "House",
12: "Guacamole",
13: "Penguin",
14: "Vehicle registration plate",
15: "Bench",
16: "Ladybug",
17: "Human nose",
18: "Watermelon",
19: "Flute",
20: "Butterfly",
21: "Washing machine",
22: "Raccoon",
23: "Segway",
24: "Taco",
25: "Jellyfish",
26: "Cake",
27: "Pen",
28: "Cannon",
29: "Bread",
30: "Tree",
31: "Shellfish",
32: "Bed",
33: "Hamster",
34: "Hat",
35: "Toaster",
36: "Sombrero",
37: "Tiara",
38: "Bowl",
39: "Dragonfly",
40: "Moths and butterflies",
41: "Antelope",
42: "Vegetable",
43: "Torch",
44: "Building",
45: "Power plugs and sockets",
46: "Blender",
47: "Billiard table",
48: "Cutting board",
49: "Bronze sculpture",
50: "Turtle",
51: "Broccoli",
52: "Tiger",
53: "Mirror",
54: "Bear",
55: "Zucchini",
56: "Dress",
57: "Volleyball",
58: "Guitar",
59: "Reptile",
60: "Golf cart",
61: "Tart",
62: "Fedora",
63: "Carnivore",
64: "Car",
65: "Lighthouse",
66: "Coffeemaker",
67: "Food processor",
68: "Truck",
69: "Bookcase",
70: "Surfboard",
71: "Footwear",
72: "Bench",
73: "Necklace",
74: "Flower",
75: "Radish",
76: "Marine mammal",
77: "Frying pan",
78: "Tap",
79: "Peach",
80: "Knife",
81: "Handbag",
82: "Laptop",
83: "Tent",
84: "Ambulance",
85: "Christmas tree",
86: "Eagle",
87: "Limousine",
88: "Kitchen & dining room table",
89: "Polar bear",
90: "Tower",
91: "Football",
92: "Willow",
93: "Human head",
94: "Stop sign",
95: "Banana",
96: "Mixer",
97: "Binoculars",
98: "Dessert",
99: "Bee",
100: "Chair",
101: "Wood-burning stove",
102: "Flowerpot",
103: "Beaker",
104: "Oyster",
105: "Woodpecker",
106: "Harp",
107: "Bathtub",
108: "Wall clock",
109: "Sports uniform",
110: "Rhinoceros",
111: "Beehive",
112: "Cupboard",
113: "Chicken",
114: "Man",
115: "Blue jay",
116: "Cucumber",
117: "Balloon",
118: "Kite",
119: "Fireplace",
120: "Lantern",
121: "Missile",
122: "Book",
123: "Spoon",
124: "Grapefruit",
125: "Squirrel",
126: "Orange",
127: "Coat",
128: "Punching bag",
129: "Zebra",
130: "Billboard",
131: "Bicycle",
132: "Door handle",
133: "Mechanical fan",
134: "Ring binder",
135: "Table",
136: "Parrot",
137: "Sock",
138: "Vase",
139: "Weapon",
140: "Shotgun",
141: "Glasses",
142: "Seahorse",
143: "Belt",
144: "Watercraft",
145: "Window",
146: "Giraffe",
147: "Lion",
148: "Tire",
149: "Vehicle",
150: "Canoe",
151: "Tie",
152: "Shelf",
153: "Picture frame",
154: "Printer",
155: "Human leg",
156: "Boat",
157: "Slow cooker",
158: "Croissant",
159: "Candle",
160: "Pancake",
161: "Pillow",
162: "Coin",
163: "Stretcher",
164: "Sandal",
165: "Woman",
166: "Stairs",
167: "Harpsichord",
168: "Stool",
169: "Bus",
170: "Suitcase",
171: "Human mouth",
172: "Juice",
173: "Skull",
174: "Door",
175: "Violin",
176: "Chopsticks",
177: "Digital clock",
178: "Sunflower",
179: "Leopard",
180: "Bell pepper",
181: "Harbor seal",
182: "Snake",
183: "Sewing machine",
184: "Goose",
185: "Helicopter",
186: "Seat belt",
187: "Coffee cup",
188: "Microwave oven",
189: "Hot dog",
190: "Countertop",
191: "Serving tray",
192: "Dog bed",
193: "Beer",
194: "Sunglasses",
195: "Golf ball",
196: "Waffle",
197: "Palm tree",
198: "Trumpet",
199: "Ruler",
200: "Helmet",
201: "Ladder",
202: "Office building",
203: "Tablet computer",
204: "Toilet paper",
205: "Pomegranate",
206: "Skirt",
207: "Gas stove",
208: "Cookie",
209: "Cart",
210: "Raven",
211: "Egg",
212: "Burrito",
213: "Goat",
214: "Kitchen knife",
215: "Skateboard",
216: "Salt and pepper shakers",
217: "Lynx",
218: "Boot",
219: "Platter",
220: "Ski",
221: "Swimwear",
222: "Swimming pool",
223: "Drinking straw",
224: "Wrench",
225: "Drum",
226: "Ant",
227: "Human ear",
228: "Headphones",
229: "Fountain",
230: "Bird",
231: "Jeans",
232: "Television",
233: "Crab",
234: "Microphone",
235: "Home appliance",
236: "Snowplow",
237: "Beetle",
238: "Artichoke",
239: "Jet ski",
240: "Stationary bicycle",
241: "Human hair",
242: "Brown bear",
243: "Starfish",
244: "Fork",
245: "Lobster",
246: "Corded phone",
247: "Drink",
248: "Saucer",
249: "Carrot",
250: "Insect",
251: "Clock",
252: "Castle",
253: "Tennis racket",
254: "Ceiling fan",
255: "Asparagus",
256: "Jaguar",
257: "Musical instrument",
258: "Train",
259: "Cat",
260: "Rifle",
261: "Dumbbell",
262: "Mobile phone",
263: "Taxi",
264: "Shower",
265: "Pitcher",
266: "Lemon",
267: "Invertebrate",
268: "Turkey",
269: "High heels",
270: "Bust",
271: "Elephant",
272: "Scarf",
273: "Barrel",
274: "Trombone",
275: "Pumpkin",
276: "Box",
277: "Tomato",
278: "Frog",
279: "Bidet",
280: "Human face",
281: "Houseplant",
282: "Van",
283: "Shark",
284: "Ice cream",
285: "Swim cap",
286: "Falcon",
287: "Ostrich",
288: "Handgun",
289: "Whiteboard",
290: "Lizard",
291: "Pasta",
292: "Snowmobile",
293: "Light bulb",
294: "Window blind",
295: "Muffin",
296: "Pretzel",
297: "Computer monitor",
298: "Horn",
299: "Furniture",
300: "Sandwich",
301: "Fox",
302: "Convenience store",
303: "Fish",
304: "Fruit",
305: "Earrings",
306: "Curtain",
307: "Grape",
308: "Sofa bed",
309: "Horse",
310: "Luggage and bags",
311: "Desk",
312: "Crutch",
313: "Bicycle helmet",
314: "Tick",
315: "Airplane",
316: "Canary",
317: "Spatula",
318: "Watch",
319: "Lily",
320: "Kitchen appliance",
321: "Filing cabinet",
322: "Aircraft",
323: "Cake stand",
324: "Candy",
325: "Sink",
326: "Mouse",
327: "Wine",
328: "Wheelchair",
329: "Goldfish",
330: "Refrigerator",
331: "French fries",
332: "Drawer",
333: "Treadmill",
334: "Picnic basket",
335: "Dice",
336: "Cabbage",
337: "Football helmet",
338: "Pig",
339: "Person",
340: "Shorts",
341: "Gondola",
342: "Honeycomb",
343: "Doughnut",
344: "Chest of drawers",
345: "Land vehicle",
346: "Bat",
347: "Monkey",
348: "Dagger",
349: "Tableware",
350: "Human foot",
351: "Mug",
352: "Alarm clock",
353: "Pressure cooker",
354: "Human hand",
355: "Tortoise",
356: "Baseball glove",
357: "Sword",
358: "Pear",
359: "Miniskirt",
360: "Traffic sign",
361: "Girl",
362: "Roller skates",
363: "Dinosaur",
364: "Porch",
365: "Human beard",
366: "Submarine sandwich",
367: "Screwdriver",
368: "Strawberry",
369: "Wine glass",
370: "Seafood",
371: "Racket",
372: "Wheel",
373: "Sea lion",
374: "Toy",
375: "Tea",
376: "Tennis ball",
377: "Waste container",
378: "Mule",
379: "Cricket ball",
380: "Pineapple",
381: "Coconut",
382: "Doll",
383: "Coffee table",
384: "Snowman",
385: "Lavender",
386: "Shrimp",
387: "Maple",
388: "Cowboy hat",
389: "Goggles",
390: "Rugby ball",
391: "Caterpillar",
392: "Poster",
393: "Rocket",
394: "Organ",
395: "Saxophone",
396: "Traffic light",
397: "Cocktail",
398: "Plastic bag",
399: "Squash",
400: "Mushroom",
401: "Hamburger",
402: "Light switch",
403: "Parachute",
404: "Teddy bear",
405: "Winter melon",
406: "Deer",
407: "Musical keyboard",
408: "Plumbing fixture",
409: "Scoreboard",
410: "Baseball bat",
411: "Envelope",
412: "Adhesive tape",
413: "Briefcase",
414: "Paddle",
415: "Bow and arrow",
416: "Telephone",
417: "Sheep",
418: "Jacket",
419: "Boy",
420: "Pizza",
421: "Otter",
422: "Office supplies",
423: "Couch",
424: "Cello",
425: "Bull",
426: "Camel",
427: "Ball",
428: "Duck",
429: "Whale",
430: "Shirt",
431: "Tank",
432: "Motorcycle",
433: "Accordion",
434: "Owl",
435: "Porcupine",
436: "Sun hat",
437: "Nail",
438: "Scissors",
439: "Swan",
440: "Lamp",
441: "Crown",
442: "Piano",
443: "Sculpture",
444: "Cheetah",
445: "Oboe",
446: "Tin can",
447: "Mango",
448: "Tripod",
449: "Oven",
450: "Mouse",
451: "Barge",
452: "Coffee",
453: "Snowboard",
454: "Common fig",
455: "Salad",
456: "Marine invertebrates",
457: "Umbrella",
458: "Kangaroo",
459: "Human arm",
460: "Measuring cup",
461: "Snail",
462: "Loveseat",
463: "Suit",
464: "Teapot",
465: "Bottle",
466: "Alpaca",
467: "Kettle",
468: "Trousers",
469: "Popcorn",
470: "Centipede",
471: "Spider",
472: "Sparrow",
473: "Plate",
474: "Bagel",
475: "Personal care",
476: "Apple",
477: "Brassiere",
478: "Bathroom cabinet",
479: "studio couch",
480: "Computer keyboard",
481: "Table tennis racket",
482: "Sushi",
483: "Cabinetry",
484: "Street light",
485: "Towel",
486: "Nightstand",
487: "Rabbit",
488: "Dolphin",
489: "Dog",
490: "Jug",
491: "Wok",
492: "Fire hydrant",
493: "Human eye",
494: "Skyscraper",
495: "Backpack",
496: "Potato",
497: "Paper towel",
498: "Lifejacket",
499: "Bicycle wheel",
500: "Toilet",
}
return clsid2catid, catid2name
def _visdrone_category():
clsid2catid = {i: i for i in range(10)}
catid2name = {
0: 'pedestrian',
1: 'people',
2: 'bicycle',
3: 'car',
4: 'van',
5: 'truck',
6: 'tricycle',
7: 'awning-tricycle',
8: 'bus',
9: 'motor'
}
return clsid2catid, catid2name

View File

@@ -0,0 +1,596 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import numpy as np
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
__all__ = [
'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset'
]
@register
@serializable
class COCODataSet(DetDataset):
"""
Load dataset with COCO format.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): coco annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
load_crowd (bool): whether to load crowded ground-truth.
False as default
allow_empty (bool): whether to load empty entry. False as default
empty_ratio (float): the ratio of empty records to the total number of
records; if empty_ratio is outside [0., 1.), do not sample the
records and use all the empty entries. 1. as default
repeat (int): repeat times for dataset, used in benchmark.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1):
super(COCODataSet, self).__init__(
dataset_dir,
image_dir,
anno_path,
data_fields,
sample_num,
repeat=repeat)
self.load_image_only = False
self.load_semantic = False
self.load_crowd = load_crowd
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio
def _sample_empty(self, records, num):
# if empty_ratio is out of [0. ,1.), do not sample the records
if self.empty_ratio < 0. or self.empty_ratio >= 1.:
return records
import random
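# With `num` non-empty records kept, sampling num * r / (1 - r) empty records
# makes empty entries roughly the fraction r (= empty_ratio) of the final set.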
sample_num = min(
int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
records = random.sample(records, sample_num)
return records
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
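# Map the (possibly non-contiguous) COCO category ids to contiguous class ids
# starting from 0, and build the category-name -> class-id lookup.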
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset:
self.load_image_only = True
logger.warning('Annotation file: {} does not contain ground truth, '
'loading image information only.'.format(anno_path))
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
coco_rec = {
'im_file': im_path,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
} if 'image' in self.data_fields else {}
if not self.load_image_only:
ins_anno_ids = coco.getAnnIds(
imgIds=[img_id], iscrowd=None if self.load_crowd else False)
instances = coco.loadAnns(ins_anno_ids)
bboxes = []
is_rbox_anno = False
for inst in instances:
# check gt bbox
if inst.get('ignore', False):
continue
if 'bbox' not in inst.keys():
continue
else:
if not any(np.array(inst['bbox'])):
continue
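# COCO boxes are [x, y, w, h]; convert to [x1, y1, x2, y2] and keep only
# boxes with positive area and side lengths larger than eps.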
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
eps = 1e-5
if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
if num_bbox <= 0 and not self.allow_empty:
continue
elif num_bbox <= 0:
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32)
has_segmentation = False
has_track_id = False
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
is_crowd[i][0] = box['iscrowd']
# check RLE format
if 'segmentation' in box and box['iscrowd'] == 1:
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
elif 'segmentation' in box and box['segmentation']:
if not np.array(
box['segmentation'],
dtype=object).size > 0 and not self.allow_empty:
bboxes.pop(i)
gt_poly.pop(i)
np.delete(is_crowd, i)
np.delete(gt_class, i)
np.delete(gt_bbox, i)
else:
gt_poly[i] = box['segmentation']
has_segmentation = True
if 'track_id' in box:
gt_track_id[i][0] = box['track_id']
has_track_id = True
if has_segmentation and not any(
gt_poly) and not self.allow_empty:
continue
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_poly': gt_poly,
}
if has_track_id:
gt_rec.update({'gt_track_id': gt_track_id})
for k, v in gt_rec.items():
if k in self.data_fields:
coco_rec[k] = v
# TODO: remove load_semantic
if self.load_semantic and 'semantic' in self.data_fields:
seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
'train2017', im_fname[:-3] + 'png')
coco_rec.update({'semantic': seg_path})
logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
im_path, img_id, im_h, im_w))
if is_empty:
empty_records.append(coco_rec)
else:
records.append(coco_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert ct > 0, 'no coco record found in %s' % (anno_path)
logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
format(ct, len(img_ids) - ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
@register
@serializable
class SlicedCOCODataSet(COCODataSet):
"""Sliced COCODataSet"""
def __init__(
self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1,
sliced_size=[640, 640],
overlap_ratio=[0.25, 0.25], ):
super(SlicedCOCODataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
load_crowd=load_crowd,
allow_empty=allow_empty,
empty_ratio=empty_ratio,
repeat=repeat, )
self.sliced_size = sliced_size
self.overlap_ratio = overlap_ratio
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
ct_sub = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset:
self.load_image_only = True
logger.warning('Annotation file: {} does not contain ground truth, '
'loading image information only.'.format(anno_path))
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
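# Slice each image into overlapping patches with sahi; every patch record
# keeps its top-left offset ('st_pix') so detections on the patch can be
# mapped back to the original image.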
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=self.sliced_size[0],
slice_width=self.sliced_size[1],
overlap_height_ratio=self.overlap_ratio[0],
overlap_width_ratio=self.overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
coco_rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([img_id]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(coco_rec)
ct_sub += sub_img_num
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert ct > 0, 'no coco record found in %s' % (anno_path)
logger.info('{} samples sliced into {} sub_samples in file {}'.format(
ct, ct_sub, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
@register
@serializable
class SemiCOCODataSet(COCODataSet):
"""Semi-COCODataSet used for supervised and unsupervised dataSet"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=False,
empty_ratio=1.,
repeat=1,
supervised=True):
super(SemiCOCODataSet, self).__init__(
dataset_dir, image_dir, anno_path, data_fields, sample_num,
load_crowd, allow_empty, empty_ratio, repeat)
self.supervised = supervised
self.length = -1  # default: -1 means all
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
assert anno_path.endswith('.json'), \
'invalid coco annotation file: ' + anno_path
from pycocotools.coco import COCO
coco = COCO(anno_path)
img_ids = coco.getImgIds()
img_ids.sort()
cat_ids = coco.getCatIds()
records = []
empty_records = []
ct = 0
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
self.cname2cid = dict({
coco.loadCats(catid)[0]['name']: clsid
for catid, clsid in self.catid2clsid.items()
})
if 'annotations' not in coco.dataset or not self.supervised:
self.load_image_only = True
logger.warning('Annotation file: {} does not contain ground truth, '
'loading image information only.'.format(anno_path))
for img_id in img_ids:
img_anno = coco.loadImgs([img_id])[0]
im_fname = img_anno['file_name']
im_w = float(img_anno['width'])
im_h = float(img_anno['height'])
im_path = os.path.join(image_dir,
im_fname) if image_dir else im_fname
is_empty = False
if not os.path.exists(im_path):
logger.warning('Illegal image file: {}, and it will be '
'ignored'.format(im_path))
continue
if im_w < 0 or im_h < 0:
logger.warning('Illegal width: {} or height: {} in annotation, '
'and im_id: {} will be ignored'.format(
im_w, im_h, img_id))
continue
coco_rec = {
'im_file': im_path,
'im_id': np.array([img_id]),
'h': im_h,
'w': im_w,
} if 'image' in self.data_fields else {}
if not self.load_image_only:
ins_anno_ids = coco.getAnnIds(
imgIds=[img_id], iscrowd=None if self.load_crowd else False)
instances = coco.loadAnns(ins_anno_ids)
bboxes = []
is_rbox_anno = False
for inst in instances:
# check gt bbox
if inst.get('ignore', False):
continue
if 'bbox' not in inst.keys():
continue
else:
if not any(np.array(inst['bbox'])):
continue
x1, y1, box_w, box_h = inst['bbox']
x2 = x1 + box_w
y2 = y1 + box_h
eps = 1e-5
if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps:
inst['clean_bbox'] = [
round(float(x), 3) for x in [x1, y1, x2, y2]
]
bboxes.append(inst)
else:
logger.warning(
'Found an invalid bbox in annotations: im_id: {}, '
'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format(
img_id, float(inst['area']), x1, y1, x2, y2))
num_bbox = len(bboxes)
if num_bbox <= 0 and not self.allow_empty:
continue
elif num_bbox <= 0:
is_empty = True
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
is_crowd = np.zeros((num_bbox, 1), dtype=np.int32)
gt_poly = [None] * num_bbox
has_segmentation = False
for i, box in enumerate(bboxes):
catid = box['category_id']
gt_class[i][0] = self.catid2clsid[catid]
gt_bbox[i, :] = box['clean_bbox']
is_crowd[i][0] = box['iscrowd']
# check RLE format
if 'segmentation' in box and box['iscrowd'] == 1:
gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
elif 'segmentation' in box and box['segmentation']:
if not np.array(box['segmentation']
).size > 0 and not self.allow_empty:
bboxes.pop(i)
gt_poly.pop(i)
np.delete(is_crowd, i)
np.delete(gt_class, i)
np.delete(gt_bbox, i)
else:
gt_poly[i] = box['segmentation']
has_segmentation = True
if has_segmentation and not any(
gt_poly) and not self.allow_empty:
continue
gt_rec = {
'is_crowd': is_crowd,
'gt_class': gt_class,
'gt_bbox': gt_bbox,
'gt_poly': gt_poly,
}
for k, v in gt_rec.items():
if k in self.data_fields:
coco_rec[k] = v
# TODO: remove load_semantic
if self.load_semantic and 'semantic' in self.data_fields:
seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps',
'train2017', im_fname[:-3] + 'png')
coco_rec.update({'semantic': seg_path})
logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format(
im_path, img_id, im_h, im_w))
if is_empty:
empty_records.append(coco_rec)
else:
records.append(coco_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert ct > 0, 'no coco record found in %s' % (anno_path)
logger.info('Load [{} samples valid, {} samples invalid] in file {}.'.
format(ct, len(img_ids) - ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs = records
if self.supervised:
logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED')
else:
if self.length > 0:  # unsup length will be decided by sup length
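# Sample unlabeled records with replacement until the unsupervised set
# matches the externally decided length.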
all_roidbs = self.roidbs.copy()
selected_idxs = [
np.random.choice(len(all_roidbs))
for _ in range(self.length)
]
self.roidbs = [all_roidbs[i] for i in selected_idxs]
logger.info(
f'Use {len(self.roidbs)} unsup_samples data as UNLABELED')
def __getitem__(self, idx):
n = len(self.roidbs)
if self.repeat > 1:
idx %= n
# data batch
roidb = copy.deepcopy(self.roidbs[idx])
if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
roidb = [roidb, ] + [
copy.deepcopy(self.roidbs[np.random.randint(n)])
for _ in range(4)
]
if isinstance(roidb, Sequence):
for r in roidb:
r['curr_iter'] = self._curr_iter
else:
roidb['curr_iter'] = self._curr_iter
self._curr_iter += 1
return self.transform(roidb)
# for PaddleX
@register
@serializable
class COCODetDataset(COCODataSet):
pass

View File

@@ -0,0 +1,206 @@
from ppdet.core.workspace import register, serializable
import cv2
import os
import tarfile
import numpy as np
import os.path as osp
from ppdet.data.source.dataset import DetDataset
from imgaug.augmentables.lines import LineStringsOnImage
from imgaug.augmentables.segmaps import SegmentationMapsOnImage
from ppdet.data.culane_utils import lane_to_linestrings
import pickle as pkl
from ppdet.utils.logger import setup_logger
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from .dataset import DetDataset, _make_dataset, _is_valid_file
from ppdet.utils.download import download_dataset
logger = setup_logger(__name__)
@register
@serializable
class CULaneDataSet(DetDataset):
def __init__(
self,
dataset_dir,
cut_height,
list_path,
split='train',
data_fields=['image'],
video_file=None,
frame_rate=-1, ):
super(CULaneDataSet, self).__init__(
dataset_dir=dataset_dir,
cut_height=cut_height,
split=split,
data_fields=data_fields)
self.dataset_dir = dataset_dir
self.list_path = osp.join(dataset_dir, list_path)
self.cut_height = cut_height
self.data_fields = data_fields
self.split = split
self.training = 'train' in split
self.data_infos = []
self.video_file = video_file
self.frame_rate = frame_rate
self._imid2path = {}
self.predict_dir = None
def __len__(self):
return len(self.data_infos)
def check_or_download_dataset(self):
if not osp.exists(self.dataset_dir):
download_dataset("dataset", dataset="culane")
# extract .tar files in self.dataset_dir
for fname in os.listdir(self.dataset_dir):
logger.info("Decompressing {}...".format(fname))
# ignore .* files
if fname.startswith('.'):
continue
if fname.find('.tar.gz') >= 0:
with tarfile.open(osp.join(self.dataset_dir, fname)) as tf:
tf.extractall(path=self.dataset_dir)
logger.info("Dataset files are ready.")
def parse_dataset(self):
logger.info('Loading CULane annotations...')
if self.predict_dir is not None:
logger.info('switch to predict mode')
return
# Waiting for the dataset to load is tedious, let's cache it
os.makedirs('cache', exist_ok=True)
cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split)
if os.path.exists(cache_path):
with open(cache_path, 'rb') as cache_file:
self.data_infos = pkl.load(cache_file)
self.max_lanes = max(
len(anno['lanes']) for anno in self.data_infos)
return
with open(self.list_path) as list_file:
for line in list_file:
infos = self.load_annotation(line.split())
self.data_infos.append(infos)
# cache data infos to file
with open(cache_path, 'wb') as cache_file:
pkl.dump(self.data_infos, cache_file)
def load_annotation(self, line):
infos = {}
img_line = line[0]
img_line = img_line[1 if img_line[0] == '/' else 0::]
img_path = os.path.join(self.dataset_dir, img_line)
infos['img_name'] = img_line
infos['img_path'] = img_path
if len(line) > 1:
mask_line = line[1]
mask_line = mask_line[1 if mask_line[0] == '/' else 0::]
mask_path = os.path.join(self.dataset_dir, mask_line)
infos['mask_path'] = mask_path
if len(line) > 2:
exist_list = [int(l) for l in line[2:]]
infos['lane_exist'] = np.array(exist_list)
anno_path = img_path[:
-3] + 'lines.txt'  # remove suffix 'jpg' and add 'lines.txt'
with open(anno_path, 'r') as anno_file:
data = [
list(map(float, line.split())) for line in anno_file.readlines()
]
lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)
if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data]
lanes = [list(set(lane)) for lane in lanes] # remove duplicated points
lanes = [lane for lane in lanes
if len(lane) > 2]  # remove lanes with 2 or fewer points
lanes = [sorted(
lane, key=lambda x: x[1]) for lane in lanes] # sort by y
infos['lanes'] = lanes
return infos
def set_images(self, images):
self.predict_dir = images
self.data_infos = self._load_images()
def _find_images(self):
predict_dir = self.predict_dir
if not isinstance(predict_dir, Sequence):
predict_dir = [predict_dir]
images = []
for im_dir in predict_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.predict_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._find_images()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {
'im_id': np.array([ct]),
"img_path": os.path.abspath(image),
"img_name": os.path.basename(image),
"lanes": []
}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def __getitem__(self, idx):
data_info = self.data_infos[idx]
img = cv2.imread(data_info['img_path'])
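# Crop away the top cut_height rows; the mask and lane y-coordinates
# below are shifted by the same amount so annotations stay aligned.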
img = img[self.cut_height:, :, :]
sample = data_info.copy()
sample.update({'image': img})
img_org = sample['image']
if self.training:
label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED)
if len(label.shape) > 2:
label = label[:, :, 0]
label = label.squeeze()
label = label[self.cut_height:, :]
sample.update({'mask': label})
if self.cut_height != 0:
new_lanes = []
for i in sample['lanes']:
lanes = []
for p in i:
lanes.append((p[0], p[1] - self.cut_height))
new_lanes.append(lanes)
sample.update({'lanes': new_lanes})
sample['mask'] = SegmentationMapsOnImage(
sample['mask'], shape=img_org.shape)
sample['full_img_path'] = data_info['img_path']
sample['img_name'] = data_info['img_name']
sample['im_id'] = np.array([idx])
sample['image'] = sample['image'].copy().astype(np.uint8)
sample['lanes'] = lane_to_linestrings(sample['lanes'])
sample['lanes'] = LineStringsOnImage(
sample['lanes'], shape=img_org.shape)
sample['seg'] = np.zeros(img_org.shape)
return sample

View File

@@ -0,0 +1,307 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import copy
import numpy as np
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from paddle.io import Dataset
from ppdet.core.workspace import register, serializable
from ppdet.utils.download import get_dataset_path
from ppdet.data import source
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@serializable
class DetDataset(Dataset):
"""
Load detection dataset.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
use_default_label (bool): whether to load default label list.
repeat (int): repeat times for dataset, used in benchmark.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
use_default_label=None,
repeat=1,
**kwargs):
super(DetDataset, self).__init__()
self.dataset_dir = dataset_dir if dataset_dir is not None else ''
self.anno_path = anno_path
self.image_dir = image_dir if image_dir is not None else ''
self.data_fields = data_fields
self.sample_num = sample_num
self.use_default_label = use_default_label
self.repeat = repeat
self._epoch = 0
self._curr_iter = 0
def __len__(self, ):
return len(self.roidbs) * self.repeat
def __call__(self, *args, **kwargs):
return self
def __getitem__(self, idx):
n = len(self.roidbs)
if self.repeat > 1:
idx %= n
# data batch
roidb = copy.deepcopy(self.roidbs[idx])
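# Mixup/cutmix/mosaic are active while the epoch is below the corresponding
# *_epoch threshold (0 keeps them always on, -1 disables them): mixup and
# cutmix pair one extra random sample, mosaic adds four, and the CenterTrack
# branch pairs the previous frame.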
if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch:
idx = np.random.randint(n)
roidb = [roidb, copy.deepcopy(self.roidbs[idx])]
elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch:
roidb = [roidb, ] + [
copy.deepcopy(self.roidbs[np.random.randint(n)])
for _ in range(4)
]
elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch:
# Add previous image as input, only used in CenterTrack
idx_pre_img = idx - 1
if idx_pre_img < 0:
idx_pre_img = idx + 1
roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])]
if isinstance(roidb, Sequence):
for r in roidb:
r['curr_iter'] = self._curr_iter
else:
roidb['curr_iter'] = self._curr_iter
self._curr_iter += 1
return self.transform(roidb)
def check_or_download_dataset(self):
self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path,
self.image_dir)
def set_kwargs(self, **kwargs):
self.mixup_epoch = kwargs.get('mixup_epoch', -1)
self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)
self.pre_img_epoch = kwargs.get('pre_img_epoch', -1)
def set_transform(self, transform):
self.transform = transform
def set_epoch(self, epoch_id):
self._epoch = epoch_id
def parse_dataset(self, ):
raise NotImplementedError(
"Need to implement parse_dataset method of Dataset")
def get_anno(self):
if self.anno_path is None:
return
return os.path.join(self.dataset_dir, self.anno_path)
def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')):
return f.lower().endswith(extensions)
def _make_dataset(dir):
dir = os.path.expanduser(dir)
if not os.path.isdir(dir):
raise ValueError('{} should be a dir'.format(dir))
images = []
for root, _, fnames in sorted(os.walk(dir, followlinks=True)):
for fname in sorted(fnames):
path = os.path.join(root, fname)
if _is_valid_file(path):
images.append(path)
return images
@register
@serializable
class ImageFolder(DetDataset):
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
sample_num=-1,
use_default_label=None,
**kwargs):
super(ImageFolder, self).__init__(
dataset_dir,
image_dir,
anno_path,
sample_num=sample_num,
use_default_label=use_default_label)
self._imid2path = {}
self.roidbs = None
self.sample_num = sample_num
def check_or_download_dataset(self):
return
def get_anno(self):
if self.anno_path is None:
return
if self.dataset_dir:
return os.path.join(self.dataset_dir, self.anno_path)
else:
return self.anno_path
def parse_dataset(self, ):
if not self.roidbs:
self.roidbs = self._load_images()
def _parse(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._parse()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {'im_id': np.array([ct]), 'im_file': image}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def set_images(self, images):
self.image_dir = images
self.roidbs = self._load_images()
def set_slice_images(self,
images,
slice_size=[640, 640],
overlap_ratio=[0.25, 0.25]):
self.image_dir = images
ori_records = self._load_images()
try:
import sahi
from sahi.slicing import slice_image
except Exception as e:
logger.error(
'sahi not found, please install sahi. '
'for example: `pip install sahi`, see https://github.com/obss/sahi.'
)
raise e
sub_img_ids = 0
ct = 0
ct_sub = 0
records = []
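# Slice every input image into overlapping patches for inference; each patch
# record stores its offset ('st_pix') and an 'is_last' flag so per-patch
# detections can later be merged back into full-image results.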
for i, ori_rec in enumerate(ori_records):
im_path = ori_rec['im_file']
slice_image_result = sahi.slicing.slice_image(
image=im_path,
slice_height=slice_size[0],
slice_width=slice_size[1],
overlap_height_ratio=overlap_ratio[0],
overlap_width_ratio=overlap_ratio[1])
sub_img_num = len(slice_image_result)
for _ind in range(sub_img_num):
im = slice_image_result.images[_ind]
rec = {
'image': im,
'im_id': np.array([sub_img_ids + _ind]),
'h': im.shape[0],
'w': im.shape[1],
'ori_im_id': np.array([ori_rec['im_id'][0]]),
'st_pix': np.array(
slice_image_result.starting_pixels[_ind],
dtype=np.float32),
'is_last': 1 if _ind == sub_img_num - 1 else 0,
} if 'image' in self.data_fields else {}
records.append(rec)
ct_sub += sub_img_num
ct += 1
logger.info('{} samples sliced into {} sub_samples.'.format(ct,
ct_sub))
self.roidbs = records
def get_label_list(self):
# Only VOC dataset needs label list in ImageFolder
return self.anno_path
@register
class CommonDataset(object):
def __init__(self, **dataset_args):
super(CommonDataset, self).__init__()
dataset_args = copy.deepcopy(dataset_args)
type = dataset_args.pop("name")
self.dataset = getattr(source, type)(**dataset_args)
def __call__(self):
return self.dataset
@register
class TrainDataset(CommonDataset):
pass
@register
class EvalMOTDataset(CommonDataset):
pass
@register
class TestMOTDataset(CommonDataset):
pass
@register
class EvalDataset(CommonDataset):
pass
@register
class TestDataset(CommonDataset):
pass

View File

@@ -0,0 +1,845 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/open-mmlab/mmpose
"""
import os
import cv2
import numpy as np
import json
import copy
import pycocotools
from pycocotools.coco import COCO
from .dataset import DetDataset
from ppdet.core.workspace import register, serializable
@serializable
class KeypointBottomUpBaseDataset(DetDataset):
"""Base class for bottom-up datasets.
All datasets should subclass it.
All subclasses should overwrite:
Methods:`_get_imganno`
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False):
super().__init__(dataset_dir, image_dir, anno_path)
self.image_info = {}
self.ann_info = {}
self.img_prefix = os.path.join(dataset_dir, image_dir)
self.transform = transform
self.test_mode = test_mode
self.ann_info['num_joints'] = num_joints
self.img_ids = []
def parse_dataset(self):
pass
def __len__(self):
"""Get dataset length."""
return len(self.img_ids)
def _get_imganno(self, idx):
"""Get anno for a single image."""
raise NotImplementedError
def __getitem__(self, idx):
"""Prepare image for training given the index."""
records = copy.deepcopy(self._get_imganno(idx))
records['image'] = cv2.imread(records['image_file'])
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
if 'mask' in records:
records['mask'] = (records['mask'] + 0).astype('uint8')
records = self.transform(records)
return records
def parse_dataset(self):
return
@register
@serializable
class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset):
"""COCO dataset for bottom-up pose estimation.
The dataset loads raw features and applies specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes::
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False,
return_mask=True,
return_bbox=True,
return_area=True,
return_class=True):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform, shard, test_mode)
self.ann_file = os.path.join(dataset_dir, anno_path)
self.shard = shard
self.test_mode = test_mode
self.return_mask = return_mask
self.return_bbox = return_bbox
self.return_area = return_area
self.return_class = return_class
def parse_dataset(self):
self.coco = COCO(self.ann_file)
self.img_ids = self.coco.getImgIds()
if not self.test_mode:
self.img_ids_tmp = []
for img_id in self.img_ids:
ann_ids = self.coco.getAnnIds(imgIds=img_id)
anno = self.coco.loadAnns(ann_ids)
anno = [obj for obj in anno if obj['iscrowd'] == 0]
if len(anno) == 0:
continue
self.img_ids_tmp.append(img_id)
self.img_ids = self.img_ids_tmp
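# Split the image ids evenly across distributed workers: this shard keeps
# the block [rank * blocknum, (rank + 1) * blocknum).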
blocknum = int(len(self.img_ids) / self.shard[1])
self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
self.shard[0] + 1))]
self.num_images = len(self.img_ids)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.dataset_name = 'coco'
cat_ids = self.coco.getCatIds()
self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)})
print('=> num_images: {}'.format(self.num_images))
@staticmethod
def _get_mapping_id_name(imgs):
"""
Args:
imgs (dict): dict of image info.
Returns:
tuple: Image name & id mapping dicts.
- id2name (dict): Mapping image id to name.
- name2id (dict): Mapping image name to id.
"""
id2name = {}
name2id = {}
for image_id, image in imgs.items():
file_name = image['file_name']
id2name[image_id] = file_name
name2id[file_name] = image_id
return id2name, name2id
def _get_imganno(self, idx):
"""Get anno for a single image.
Args:
idx (int): image idx
Returns:
dict: info for model training
"""
coco = self.coco
img_id = self.img_ids[idx]
ann_ids = coco.getAnnIds(imgIds=img_id)
anno = coco.loadAnns(ann_ids)
anno = [
obj for obj in anno
if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0
]
db_rec = {}
joints, orgsize = self._get_joints(anno, idx)
db_rec['gt_joints'] = joints
db_rec['im_shape'] = orgsize
if self.return_bbox:
db_rec['gt_bbox'] = self._get_bboxs(anno, idx)
if self.return_class:
db_rec['gt_class'] = self._get_labels(anno, idx)
if self.return_area:
db_rec['gt_areas'] = self._get_areas(anno, idx)
if self.return_mask:
db_rec['mask'] = self._get_mask(anno, idx)
db_rec['im_id'] = img_id
db_rec['image_file'] = os.path.join(self.img_prefix,
self.id2name[img_id])
return db_rec
def _get_joints(self, anno, idx):
"""Get joints for all people in an image."""
num_people = len(anno)
joints = np.zeros(
(num_people, self.ann_info['num_joints'], 3), dtype=np.float32)
for i, obj in enumerate(anno):
joints[i, :self.ann_info['num_joints'], :3] = \
np.array(obj['keypoints']).reshape([-1, 3])
img_info = self.coco.loadImgs(self.img_ids[idx])[0]
orgsize = np.array([img_info['height'], img_info['width'], 1])
return joints, orgsize
def _get_bboxs(self, anno, idx):
num_people = len(anno)
gt_bboxes = np.zeros((num_people, 4), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'bbox' in obj:
gt_bboxes[idx, :] = obj['bbox']
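# COCO boxes are stored as [x, y, w, h]; convert them to [x1, y1, x2, y2].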
gt_bboxes[:, 2] += gt_bboxes[:, 0]
gt_bboxes[:, 3] += gt_bboxes[:, 1]
return gt_bboxes
def _get_labels(self, anno, idx):
num_people = len(anno)
gt_labels = np.zeros((num_people, 1), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'category_id' in obj:
catid = obj['category_id']
gt_labels[idx, 0] = self.catid2clsid[catid]
return gt_labels
def _get_areas(self, anno, idx):
num_people = len(anno)
gt_areas = np.zeros((num_people, ), dtype=np.float32)
for idx, obj in enumerate(anno):
if 'area' in obj:
gt_areas[idx, ] = obj['area']
return gt_areas
def _get_mask(self, anno, idx):
"""Get ignore masks to mask out losses."""
coco = self.coco
img_info = coco.loadImgs(self.img_ids[idx])[0]
m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32)
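# Accumulate masks of crowd regions and of people annotated without
# keypoints; those pixels become ignore regions (returned as False below).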
for obj in anno:
if 'segmentation' in obj:
if obj['iscrowd']:
rle = pycocotools.mask.frPyObjects(obj['segmentation'],
img_info['height'],
img_info['width'])
m += pycocotools.mask.decode(rle)
elif obj['num_keypoints'] == 0:
rles = pycocotools.mask.frPyObjects(obj['segmentation'],
img_info['height'],
img_info['width'])
for rle in rles:
m += pycocotools.mask.decode(rle)
return m < 0.5
@register
@serializable
class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset):
"""CrowdPose dataset for bottom-up pose estimation.
The dataset loads raw features and applies specified transforms
to return a dict containing the image tensors and other information.
CrowdPose keypoint indexes::
0: 'left_shoulder',
1: 'right_shoulder',
2: 'left_elbow',
3: 'right_elbow',
4: 'left_wrist',
5: 'right_wrist',
6: 'left_hip',
7: 'right_hip',
8: 'left_knee',
9: 'right_knee',
10: 'left_ankle',
11: 'right_ankle',
12: 'top_head',
13: 'neck'
Args:
dataset_dir (str): Root path to the dataset.
anno_path (str): Relative path to the annotation file.
image_dir (str): Path to a directory where images are held.
Default: None.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
shard (list): [rank, worldsize], the distributed env params
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[],
shard=[0, 1],
test_mode=False):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform, shard, test_mode)
self.ann_file = os.path.join(dataset_dir, anno_path)
self.shard = shard
self.test_mode = test_mode
def parse_dataset(self):
self.coco = COCO(self.ann_file)
self.img_ids = self.coco.getImgIds()
if not self.test_mode:
self.img_ids = [
img_id for img_id in self.img_ids
if len(self.coco.getAnnIds(
imgIds=img_id, iscrowd=None)) > 0
]
blocknum = int(len(self.img_ids) / self.shard[1])
self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * (
self.shard[0] + 1))]
self.num_images = len(self.img_ids)
self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs)
self.dataset_name = 'crowdpose'
print('=> num_images: {}'.format(self.num_images))
@serializable
class KeypointTopDownBaseDataset(DetDataset):
"""Base class for top_down datasets.
All datasets should subclass it.
All subclasses should overwrite:
Methods:`_get_db`
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): keypoint numbers
transform (composed(operators)): A sequence of data transforms.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[]):
super().__init__(dataset_dir, image_dir, anno_path)
self.image_info = {}
self.ann_info = {}
self.img_prefix = os.path.join(dataset_dir, image_dir)
self.transform = transform
self.ann_info['num_joints'] = num_joints
self.db = []
def __len__(self):
"""Get dataset length."""
return len(self.db)
def _get_db(self):
"""Get a sample"""
raise NotImplementedError
def __getitem__(self, idx):
"""Prepare sample for training given the index."""
records = copy.deepcopy(self.db[idx])
records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR |
cv2.IMREAD_IGNORE_ORIENTATION)
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
records['score'] = records['score'] if 'score' in records else 1
records = self.transform(records)
# print('records', records)
return records
@register
@serializable
class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset):
"""COCO dataset for top-down pose estimation.
The dataset loads raw features and applies specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes:
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
bbox_file (str): Path to a detection bbox file
Default: None.
use_gt_bbox (bool): Whether to use ground truth bbox
Default: True.
pixel_std (int): The pixel std of the scale
Default: 200.
image_thre (float): The threshold to filter the detection box
Default: 0.0.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
trainsize,
transform=[],
bbox_file=None,
use_gt_bbox=True,
pixel_std=200,
image_thre=0.0,
center_scale=None):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.bbox_file = bbox_file
self.use_gt_bbox = use_gt_bbox
self.trainsize = trainsize
self.pixel_std = pixel_std
self.image_thre = image_thre
self.center_scale = center_scale
self.dataset_name = 'coco'
def parse_dataset(self):
if self.use_gt_bbox:
self.db = self._load_coco_keypoint_annotations()
else:
self.db = self._load_coco_person_detection_results()
def _load_coco_keypoint_annotations(self):
coco = COCO(self.get_anno())
img_ids = coco.getImgIds()
gt_db = []
for index in img_ids:
im_ann = coco.loadImgs(index)[0]
width = im_ann['width']
height = im_ann['height']
file_name = im_ann['file_name']
im_id = int(im_ann["id"])
annIds = coco.getAnnIds(imgIds=index, iscrowd=False)
objs = coco.loadAnns(annIds)
valid_objs = []
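# Clip each box to the image bounds, store it as [x1, y1, w, h], and drop
# degenerate boxes with non-positive area.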
for obj in objs:
x, y, w, h = obj['bbox']
x1 = np.max((0, x))
y1 = np.max((0, y))
x2 = np.min((width - 1, x1 + np.max((0, w - 1))))
y2 = np.min((height - 1, y1 + np.max((0, h - 1))))
if obj['area'] > 0 and x2 >= x1 and y2 >= y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
rec = []
for obj in objs:
if max(obj['keypoints']) == 0:
continue
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
for ipt in range(self.ann_info['num_joints']):
joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0]
joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1]
joints[ipt, 2] = 0
t_vis = obj['keypoints'][ipt * 3 + 2]
if t_vis > 1:
t_vis = 1
joints_vis[ipt, 0] = t_vis
joints_vis[ipt, 1] = t_vis
joints_vis[ipt, 2] = 0
center, scale = self._box2cs(obj['clean_bbox'][:4])
rec.append({
'image_file': os.path.join(self.img_prefix, file_name),
'center': center,
'scale': scale,
'gt_joints': joints,
'joints_vis': joints_vis,
'im_id': im_id,
})
gt_db.extend(rec)
return gt_db
def _box2cs(self, box):
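# Convert an [x, y, w, h] box into a (center, scale) pair: the box is padded
# to the training aspect ratio, the scale is normalized by pixel_std,
# optionally jittered by center_scale, and enlarged by 1.25 to keep some
# context around the person.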
x, y, w, h = box[:4]
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
if self.center_scale is not None and np.random.rand() < 0.3:
center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
if center[0] != -1:
scale = scale * 1.25
return center, scale
def _load_coco_person_detection_results(self):
all_boxes = None
bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file)
with open(bbox_file_path, 'r') as f:
all_boxes = json.load(f)
if not all_boxes:
print('=> Failed to load %s!' % bbox_file_path)
return None
kpt_db = []
for n_img in range(0, len(all_boxes)):
det_res = all_boxes[n_img]
if det_res['category_id'] != 1:
continue
file_name = det_res[
'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[
'image_id']
img_name = os.path.join(self.img_prefix, file_name)
box = det_res['bbox']
score = det_res['score']
im_id = int(det_res['image_id'])
if score < self.image_thre:
continue
center, scale = self._box2cs(box)
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.ones(
(self.ann_info['num_joints'], 3), dtype=np.float32)
kpt_db.append({
'image_file': img_name,
'im_id': im_id,
'center': center,
'scale': scale,
'score': score,
'gt_joints': joints,
'joints_vis': joints_vis,
})
return kpt_db
@register
@serializable
class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset):
"""CocoWholeBody dataset for top-down hand pose estimation.
The dataset loads raw features and applies specified transforms
to return a dict containing the image tensors and other information.
COCO-WholeBody Hand keypoint indexes:
0: 'wrist',
1: 'thumb1',
2: 'thumb2',
3: 'thumb3',
4: 'thumb4',
5: 'forefinger1',
6: 'forefinger2',
7: 'forefinger3',
8: 'forefinger4',
9: 'middle_finger1',
10: 'middle_finger2',
11: 'middle_finger3',
12: 'middle_finger4',
13: 'ring_finger1',
14: 'ring_finger2',
15: 'ring_finger3',
16: 'ring_finger4',
17: 'pinky_finger1',
18: 'pinky_finger2',
19: 'pinky_finger3',
20: 'pinky_finger4'
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
pixel_std (int): The pixel std of the scale
Default: 200.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
trainsize,
transform=[],
pixel_std=200):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.trainsize = trainsize
self.pixel_std = pixel_std
self.dataset_name = 'coco_wholebady_hand'
def _box2cs(self, box):
x, y, w, h = box[:4]
center = np.zeros((2), dtype=np.float32)
center[0] = x + w * 0.5
center[1] = y + h * 0.5
aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1]
if w > aspect_ratio * h:
h = w * 1.0 / aspect_ratio
elif w < aspect_ratio * h:
w = h * aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
if center[0] != -1:
scale = scale * 1.25
return center, scale
def parse_dataset(self):
gt_db = []
num_joints = self.ann_info['num_joints']
coco = COCO(self.get_anno())
img_ids = list(coco.imgs.keys())
for img_id in img_ids:
im_ann = coco.loadImgs(img_id)[0]
image_file = os.path.join(self.img_prefix, im_ann['file_name'])
im_id = int(im_ann["id"])
ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = coco.loadAnns(ann_ids)
for obj in objs:
for type in ['left', 'right']:
if (obj[f'{type}hand_valid'] and
max(obj[f'{type}hand_kpts']) > 0):
joints = np.zeros((num_joints, 3), dtype=np.float32)
joints_vis = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj[f'{type}hand_kpts'])
keypoints = keypoints.reshape(-1, 3)
joints[:, :2] = keypoints[:, :2]
joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3])
center, scale = self._box2cs(obj[f'{type}hand_box'][:4])
gt_db.append({
'image_file': image_file,
'center': center,
'scale': scale,
'gt_joints': joints,
'joints_vis': joints_vis,
'im_id': im_id,
})
self.db = gt_db
@register
@serializable
class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset):
"""MPII dataset for topdown pose estimation.
The dataset loads raw features and applies specified transforms
to return a dict containing the image tensors and other information.
MPII keypoint indexes::
0: 'right_ankle',
1: 'right_knee',
2: 'right_hip',
3: 'left_hip',
4: 'left_knee',
5: 'left_ankle',
6: 'pelvis',
7: 'thorax',
8: 'upper_neck',
9: 'head_top',
10: 'right_wrist',
11: 'right_elbow',
12: 'right_shoulder',
13: 'left_shoulder',
14: 'left_elbow',
15: 'left_wrist',
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
anno_path (str): Relative path to the annotation file.
num_joints (int): Keypoint numbers
trainsize (list):[w, h] Image target size
transform (composed(operators)): A sequence of data transforms.
"""
def __init__(self,
dataset_dir,
image_dir,
anno_path,
num_joints,
transform=[]):
super().__init__(dataset_dir, image_dir, anno_path, num_joints,
transform)
self.dataset_name = 'mpii'
def parse_dataset(self):
with open(self.get_anno()) as anno_file:
anno = json.load(anno_file)
gt_db = []
for a in anno:
image_name = a['image']
im_id = a['image_id'] if 'image_id' in a else int(
os.path.splitext(image_name)[0])
c = np.array(a['center'], dtype=np.float32)
s = np.array([a['scale'], a['scale']], dtype=np.float32)
# Adjust center/scale slightly to avoid cropping limbs
if c[0] != -1:
c[1] = c[1] + 15 * s[1]
s = s * 1.25
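# MPII annotations use 1-based (Matlab-style) indices; shift to 0-based.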
c = c - 1
joints = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
joints_vis = np.zeros(
(self.ann_info['num_joints'], 3), dtype=np.float32)
if 'gt_joints' in a:
joints_ = np.array(a['gt_joints'])
joints_[:, 0:2] = joints_[:, 0:2] - 1
joints_vis_ = np.array(a['joints_vis'])
assert len(joints_) == self.ann_info[
'num_joints'], 'joint num diff: {} vs {}'.format(
len(joints_), self.ann_info['num_joints'])
joints[:, 0:2] = joints_[:, 0:2]
joints_vis[:, 0] = joints_vis_[:]
joints_vis[:, 1] = joints_vis_[:]
gt_db.append({
'image_file': os.path.join(self.img_prefix, image_name),
'im_id': im_id,
'center': c,
'scale': s,
'gt_joints': joints,
'joints_vis': joints_vis
})
print("number length: {}".format(len(gt_db)))
self.db = gt_db

View File

@@ -0,0 +1,638 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import cv2
import glob
import numpy as np
from collections import OrderedDict, defaultdict
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from .dataset import DetDataset, _make_dataset, _is_valid_file
from ppdet.core.workspace import register, serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class MOTDataSet(DetDataset):
"""
Load dataset with MOT format, only support single class MOT.
Args:
dataset_dir (str): root directory for dataset.
image_lists (str|list): MOT data image lists, for a multi-source MOT dataset.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
repeat (int): repeat times for dataset, use in benchmark.
Notes:
MOT datasets root directory following this:
dataset/mot
|——————image_lists
| |——————caltech.train
| |——————caltech.val
| |——————mot16.train
| |——————mot17.train
| ......
|——————Caltech
|——————MOT17
|——————......
All the MOT datasets have the following structure:
Caltech
|——————images
| └——————00001.jpg
| |—————— ...
| └——————0000N.jpg
└——————labels_with_ids
└——————00001.txt
|—————— ...
└——————0000N.txt
or
MOT17
|——————images
| └——————train
| └——————test
└——————labels_with_ids
└——————train
"""
def __init__(self,
dataset_dir=None,
image_lists=[],
data_fields=['image'],
sample_num=-1,
repeat=1):
super(MOTDataSet, self).__init__(
dataset_dir=dataset_dir,
data_fields=data_fields,
sample_num=sample_num,
repeat=repeat)
self.dataset_dir = dataset_dir
self.image_lists = image_lists
if isinstance(self.image_lists, str):
self.image_lists = [self.image_lists]
self.roidbs = None
self.cname2cid = None
def get_anno(self):
if self.image_lists == []:
return
# only used to get categories and metric
# only check the first sub-dataset; the label_list of all sub-datasets should be the same.
first_mot_data = self.image_lists[0].split('.')[0]
anno_file = os.path.join(self.dataset_dir, first_mot_data,
'label_list.txt')
return anno_file
def parse_dataset(self):
self.img_files = OrderedDict()
self.img_start_index = OrderedDict()
self.label_files = OrderedDict()
self.tid_num = OrderedDict()
self.tid_start_index = OrderedDict()
img_index = 0
for data_name in self.image_lists:
# check every data image list
image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
assert os.path.isdir(image_lists_dir), \
"The {} is not a directory.".format(image_lists_dir)
list_path = os.path.join(image_lists_dir, data_name)
assert os.path.exists(list_path), \
"The list path {} does not exist.".format(list_path)
# record img_files, filter out empty ones
with open(list_path, 'r') as file:
self.img_files[data_name] = file.readlines()
self.img_files[data_name] = [
os.path.join(self.dataset_dir, x.strip())
for x in self.img_files[data_name]
]
self.img_files[data_name] = list(
filter(lambda x: len(x) > 0, self.img_files[data_name]))
self.img_start_index[data_name] = img_index
img_index += len(self.img_files[data_name])
# record label_files
self.label_files[data_name] = [
x.replace('images', 'labels_with_ids').replace(
'.png', '.txt').replace('.jpg', '.txt')
for x in self.img_files[data_name]
]
for data_name, label_paths in self.label_files.items():
max_index = -1
for lp in label_paths:
lb = np.loadtxt(lp)
if len(lb) < 1:
continue
if len(lb.shape) < 2:
img_max = lb[1]
else:
img_max = np.max(lb[:, 1])
if img_max > max_index:
max_index = img_max
self.tid_num[data_name] = int(max_index + 1)
last_index = 0
for i, (k, v) in enumerate(self.tid_num.items()):
self.tid_start_index[k] = last_index
last_index += v
self.num_identities_dict = defaultdict(int)
self.num_identities_dict[0] = int(last_index + 1) # single class
self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
self.total_imgs = sum(self.num_imgs_each_data)
logger.info('MOT dataset summary: ')
logger.info(self.tid_num)
logger.info('Total images: {}'.format(self.total_imgs))
logger.info('Image start index: {}'.format(self.img_start_index))
logger.info('Total identities: {}'.format(self.num_identities_dict[0]))
logger.info('Identity start index: {}'.format(self.tid_start_index))
records = []
cname2cid = mot_label()
for img_index in range(self.total_imgs):
for i, (k, v) in enumerate(self.img_start_index.items()):
if img_index >= v:
data_name = list(self.label_files.keys())[i]
start_index = v
img_file = self.img_files[data_name][img_index - start_index]
lbl_file = self.label_files[data_name][img_index - start_index]
if not os.path.exists(img_file):
logger.warning('Illegal image file: {}, and it will be ignored'.
format(img_file))
continue
if not os.path.isfile(lbl_file):
logger.warning('Illegal label file: {}, and it will be ignored'.
format(lbl_file))
continue
labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
# each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
cx, cy = labels[:, 2], labels[:, 3]
w, h = labels[:, 4], labels[:, 5]
gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
gt_class = labels[:, 0:1].astype('int32')
gt_score = np.ones((len(labels), 1)).astype('float32')
gt_ide = labels[:, 1:2].astype('int32')
for i, _ in enumerate(gt_ide):
if gt_ide[i] > -1:
gt_ide[i] += self.tid_start_index[data_name]
mot_rec = {
'im_file': img_file,
'im_id': img_index,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'gt_ide': gt_ide,
}
for k, v in gt_rec.items():
if k in self.data_fields:
mot_rec[k] = v
records.append(mot_rec)
if self.sample_num > 0 and img_index >= self.sample_num:
break
assert len(records) > 0, 'not found any mot record in %s' % (
self.image_lists)
self.roidbs, self.cname2cid = records, cname2cid
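# A minimal usage sketch (not part of the original file); the paths below are
# assumptions that follow the directory layout documented in MOTDataSet above.
def _mot_dataset_usage_sketch():
    dataset = MOTDataSet(
        dataset_dir='dataset/mot',
        image_lists=['mot17.train'],
        data_fields=['image', 'gt_bbox', 'gt_class', 'gt_ide'])
    dataset.parse_dataset()
    # number of chip records and total single-class identities
    return len(dataset.roidbs), dataset.num_identities_dict[0]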
@register
@serializable
class MCMOTDataSet(DetDataset):
"""
Load dataset with MOT format, support multi-class MOT.
Args:
dataset_dir (str): root directory for dataset.
image_lists (list(str)): MCMOT data image lists, for a multi-source MCMOT dataset.
data_fields (list): key name of data dictionary, at least have 'image'.
label_list (str): if use_default_label is False, will load
mapping between category and class index.
sample_num (int): number of samples to load, -1 means all.
Notes:
MCMOT datasets root directory following this:
dataset/mot
|——————image_lists
| |——————visdrone_mcmot.train
| |——————visdrone_mcmot.val
visdrone_mcmot
|——————images
| └——————train
| └——————val
└——————labels_with_ids
└——————train
"""
def __init__(self,
dataset_dir=None,
image_lists=[],
data_fields=['image'],
label_list=None,
sample_num=-1):
super(MCMOTDataSet, self).__init__(
dataset_dir=dataset_dir,
data_fields=data_fields,
sample_num=sample_num)
self.dataset_dir = dataset_dir
self.image_lists = image_lists
if isinstance(self.image_lists, str):
self.image_lists = [self.image_lists]
self.label_list = label_list
self.roidbs = None
self.cname2cid = None
def get_anno(self):
if self.image_lists == []:
return
# only used to get categories and metric
# only check the first sub-dataset; the label_list of all sub-datasets should be the same.
first_mot_data = self.image_lists[0].split('.')[0]
anno_file = os.path.join(self.dataset_dir, first_mot_data,
'label_list.txt')
return anno_file
def parse_dataset(self):
self.img_files = OrderedDict()
self.img_start_index = OrderedDict()
self.label_files = OrderedDict()
self.tid_num = OrderedDict()
self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT
img_index = 0
for data_name in self.image_lists:
# check every data image list
image_lists_dir = os.path.join(self.dataset_dir, 'image_lists')
assert os.path.isdir(image_lists_dir), \
"The {} is not a directory.".format(image_lists_dir)
list_path = os.path.join(image_lists_dir, data_name)
assert os.path.exists(list_path), \
"The list path {} does not exist.".format(list_path)
# record img_files, filter out empty ones
with open(list_path, 'r') as file:
self.img_files[data_name] = file.readlines()
self.img_files[data_name] = [
os.path.join(self.dataset_dir, x.strip())
for x in self.img_files[data_name]
]
self.img_files[data_name] = list(
filter(lambda x: len(x) > 0, self.img_files[data_name]))
self.img_start_index[data_name] = img_index
img_index += len(self.img_files[data_name])
# record label_files
self.label_files[data_name] = [
x.replace('images', 'labels_with_ids').replace(
'.png', '.txt').replace('.jpg', '.txt')
for x in self.img_files[data_name]
]
for data_name, label_paths in self.label_files.items():
# using max_ids_dict rather than max_index
max_ids_dict = defaultdict(int)
for lp in label_paths:
lb = np.loadtxt(lp)
if len(lb) < 1:
continue
lb = lb.reshape(-1, 6)
for item in lb:
if item[1] > max_ids_dict[int(item[0])]:
# item[0]: cls_id
# item[1]: track id
max_ids_dict[int(item[0])] = int(item[1])
# track id number
self.tid_num[data_name] = max_ids_dict
last_idx_dict = defaultdict(int)
for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset
for cls_id, id_num in v.items(): # v is a max_ids_dict
self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id]
last_idx_dict[cls_id] += id_num
self.num_identities_dict = defaultdict(int)
for k, v in last_idx_dict.items():
self.num_identities_dict[k] = int(v) # total ids of each category
self.num_imgs_each_data = [len(x) for x in self.img_files.values()]
self.total_imgs = sum(self.num_imgs_each_data)
# cname2cid and cid2cname
cname2cid = {}
if self.label_list is not None:
# if a label_list is used for a multi-source mixed dataset,
# make sure the label_list exists in the first sub_dataset at least.
sub_dataset = self.image_lists[0].split('.')[0]
label_path = os.path.join(self.dataset_dir, sub_dataset,
self.label_list)
if not os.path.exists(label_path):
logger.info(
"Note: label_list {} does not exists, use VisDrone 10 classes labels as default.".
format(label_path))
cname2cid = visdrone_mcmot_label()
else:
with open(label_path, 'r') as fr:
label_id = 0
for line in fr.readlines():
cname2cid[line.strip()] = label_id
label_id += 1
else:
cname2cid = visdrone_mcmot_label()
cid2cname = dict([(v, k) for (k, v) in cname2cid.items()])
logger.info('MCMOT dataset summary: ')
logger.info(self.tid_num)
logger.info('Total images: {}'.format(self.total_imgs))
logger.info('Image start index: {}'.format(self.img_start_index))
logger.info('Total identities of each category: ')
num_identities_dict = sorted(
self.num_identities_dict.items(), key=lambda x: x[0])
total_IDs_all_cats = 0
for (k, v) in num_identities_dict:
logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k],
v))
total_IDs_all_cats += v
logger.info('Total identities of all categories: {}'.format(
total_IDs_all_cats))
logger.info('Identity start index of each category: ')
for k, v in self.tid_start_idx_of_cls_ids.items():
sorted_v = sorted(v.items(), key=lambda x: x[0])
for (cls_id, start_idx) in sorted_v:
logger.info('Start index of dataset {} category {:d} is {:d}'
.format(k, cls_id, start_idx))
records = []
for img_index in range(self.total_imgs):
for i, (k, v) in enumerate(self.img_start_index.items()):
if img_index >= v:
data_name = list(self.label_files.keys())[i]
start_index = v
img_file = self.img_files[data_name][img_index - start_index]
lbl_file = self.label_files[data_name][img_index - start_index]
if not os.path.exists(img_file):
logger.warning('Illegal image file: {}, and it will be ignored'.
format(img_file))
continue
if not os.path.isfile(lbl_file):
logger.warning('Illegal label file: {}, and it will be ignored'.
format(lbl_file))
continue
labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6)
# each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h]
cx, cy = labels[:, 2], labels[:, 3]
w, h = labels[:, 4], labels[:, 5]
gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32')
gt_class = labels[:, 0:1].astype('int32')
gt_score = np.ones((len(labels), 1)).astype('float32')
gt_ide = labels[:, 1:2].astype('int32')
for i, _ in enumerate(gt_ide):
if gt_ide[i] > -1:
cls_id = int(gt_class[i])
start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id]
gt_ide[i] += start_idx
mot_rec = {
'im_file': img_file,
'im_id': img_index,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'gt_ide': gt_ide,
}
for k, v in gt_rec.items():
if k in self.data_fields:
mot_rec[k] = v
records.append(mot_rec)
if self.sample_num > 0 and img_index >= self.sample_num:
break
assert len(records) > 0, 'not found any mot record in %s' % (
self.image_lists)
self.roidbs, self.cname2cid = records, cname2cid
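# Illustrative example of the per-class identity offset (assumed numbers): if
# sub-dataset A has max ids {0: 10, 3: 5} and sub-dataset B has {0: 7}, then
# tid_start_idx_of_cls_ids is {A: {0: 0, 3: 0}, B: {0: 10}}, so a class-0 track
# id t read from B is stored as t + 10 and identities stay unique per category
# across sub-datasets.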
@register
@serializable
class MOTImageFolder(DetDataset):
"""
Load MOT dataset in MOT format from an image folder or a video.
Args:
video_file (str): path of the video file, default ''.
frame_rate (int): frame rate of the video; if not set, it is read via cv2.VideoCapture.
dataset_dir (str): root directory for dataset.
keep_ori_im (bool): whether to keep original image, default False.
Set True when used during MOT model inference while saving
images or video, or used in DeepSORT.
"""
def __init__(self,
video_file=None,
frame_rate=-1,
dataset_dir=None,
data_root=None,
image_dir=None,
sample_num=-1,
keep_ori_im=False,
anno_path=None,
**kwargs):
super(MOTImageFolder, self).__init__(
dataset_dir, image_dir, sample_num=sample_num)
self.video_file = video_file
self.data_root = data_root
self.keep_ori_im = keep_ori_im
self._imid2path = {}
self.roidbs = None
self.frame_rate = frame_rate
self.anno_path = anno_path
def check_or_download_dataset(self):
return
def parse_dataset(self, ):
if not self.roidbs:
if self.video_file is None:
self.frame_rate = 30 # default frame rate when inferring on an image folder
self.roidbs = self._load_images()
else:
self.roidbs = self._load_video_images()
def _load_video_images(self):
if self.frame_rate == -1:
# if frame_rate is not set for video, use cv2.VideoCapture
cap = cv2.VideoCapture(self.video_file)
self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS))
extension = self.video_file.split('.')[-1]
output_path = self.video_file.replace('.{}'.format(extension), '')
frames_path = video2frames(self.video_file, output_path,
self.frame_rate)
self.video_frames = sorted(
glob.glob(os.path.join(frames_path, '*.png')))
self.video_length = len(self.video_frames)
logger.info('Length of the video: {:d} frames.'.format(
self.video_length))
ct = 0
records = []
for image in self.video_frames:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {'im_id': np.array([ct]), 'im_file': image}
if self.keep_ori_im:
rec.update({'keep_ori_im': 1})
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def _find_images(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._find_images()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
rec = {'im_id': np.array([ct]), 'im_file': image}
if self.keep_ori_im:
rec.update({'keep_ori_im': 1})
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def set_images(self, images):
self.image_dir = images
self.roidbs = self._load_images()
def set_video(self, video_file, frame_rate):
# update video_file and frame_rate by command line of tools/infer_mot.py
self.video_file = video_file
self.frame_rate = frame_rate
assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \
"wrong or unsupported file format: {}".format(self.video_file)
self.roidbs = self._load_video_images()
def get_anno(self):
return self.anno_path
def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', '.flv')):
return f.lower().endswith(extensions)
def video2frames(video_path, outpath, frame_rate, **kargs):
def _dict2str(kargs):
cmd_str = ''
for k, v in kargs.items():
cmd_str += (' ' + str(k) + ' ' + str(v))
return cmd_str
ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error ']
vid_name = os.path.basename(video_path).split('.')[0]
out_full_path = os.path.join(outpath, vid_name)
if not os.path.exists(out_full_path):
os.makedirs(out_full_path)
# video file name
outformat = os.path.join(out_full_path, '%08d.png')
cmd = ffmpeg + [
' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat
]
cmd = ''.join(cmd) + _dict2str(kargs)
if os.system(cmd) != 0:
raise RuntimeError('ffmpeg process video: {} error'.format(video_path))
sys.exit(-1)
sys.stdout.flush()
return out_full_path
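# Usage sketch (hypothetical paths; requires ffmpeg on PATH):
#   frames_dir = video2frames('demo/test.mp4', 'output', frame_rate=30)
# frames are then written as output/test/00000001.png, 00000002.png, ...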
def mot_label():
labels_map = {'person': 0}
return labels_map
def visdrone_mcmot_label():
labels_map = {
'pedestrian': 0,
'people': 1,
'bicycle': 2,
'car': 3,
'van': 4,
'truck': 5,
'tricycle': 6,
'awning-tricycle': 7,
'bus': 8,
'motor': 9,
}
return labels_map

View File

@@ -0,0 +1,380 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import cv2
import numpy as np
import json
import copy
import pycocotools
from pycocotools.coco import COCO
from .dataset import DetDataset
from ppdet.core.workspace import register, serializable
from paddle.io import Dataset
@serializable
class Pose3DDataset(DetDataset):
"""Pose3D Dataset class.
Args:
dataset_dir (str): Root path to the dataset.
anno_list (list of str): each element is a relative path to an annotation file.
image_dirs (list of str): each element is a relative path to a directory where images are held.
transform (composed(operators)): A sequence of data transforms.
test_mode (bool): Store True when building test or
validation dataset. Default: False.
24 joints order:
0-2: 'R_Ankle', 'R_Knee', 'R_Hip',
3-5:'L_Hip', 'L_Knee', 'L_Ankle',
6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder',
9-11:'L_Shoulder','L_Elbow','L_Wrist',
12-14:'Neck','Top_of_Head','Pelvis',
15-18:'Thorax','Spine','Jaw','Head',
19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear'
"""
def __init__(self,
dataset_dir,
image_dirs,
anno_list,
transform=[],
num_joints=24,
test_mode=False):
super().__init__(dataset_dir, image_dirs, anno_list)
self.image_info = {}
self.ann_info = {}
self.num_joints = num_joints
self.transform = transform
self.test_mode = test_mode
self.img_ids = []
self.dataset_dir = dataset_dir
self.image_dirs = image_dirs
self.anno_list = anno_list
def get_mask(self, mvm_percent=0.3):
num_joints = self.num_joints
mjm_mask = np.ones((num_joints, 1)).astype(np.float32)
if self.test_mode == False:
pb = np.random.random_sample()
masked_num = int(
pb * mvm_percent *
num_joints) # at most x% of the joints could be masked
indices = np.random.choice(
np.arange(num_joints), replace=False, size=masked_num)
mjm_mask[indices, :] = 0.0
# return mjm_mask
num_joints = 10
mvm_mask = np.ones((num_joints, 1)).astype(np.float32)
if self.test_mode == False:
num_vertices = num_joints
pb = np.random.random_sample()
masked_num = int(
pb * mvm_percent *
num_vertices) # at most x% of the vertices could be masked
indices = np.random.choice(
np.arange(num_vertices), replace=False, size=masked_num)
mvm_mask[indices, :] = 0.0
mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0)
return mjm_mask
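# For example, with num_joints = 24 and mvm_percent = 0.3, at most
# int(0.3 * 24) = 7 joint rows can be zeroed for a training sample, and the
# returned mask has shape (24 + 10, 1) once the vertex mask is concatenated.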
def filterjoints(self, x):
if self.num_joints == 24:
return x
elif self.num_joints == 14:
return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :]
elif self.num_joints == 17:
return x[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :]
else:
raise ValueError(
"unsupported joint numbers, only [24 or 17 or 14] is supported!")
def parse_dataset(self):
print("Loading annotations..., please wait")
self.annos = []
im_id = 0
self.human36m_num = 0
for idx, annof in enumerate(self.anno_list):
img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx])
dataf = os.path.join(self.dataset_dir, annof)
with open(dataf, 'r') as rf:
anno_data = json.load(rf)
annos = anno_data['data']
new_annos = []
print("{} has annos numbers: {}".format(dataf, len(annos)))
for anno in annos:
new_anno = {}
new_anno['im_id'] = im_id
im_id += 1
imagename = anno['imageName']
if imagename.startswith("COCO_train2014_"):
imagename = imagename[len("COCO_train2014_"):]
elif imagename.startswith("COCO_val2014_"):
imagename = imagename[len("COCO_val2014_"):]
imagename = os.path.join(img_prefix, imagename)
if not os.path.exists(imagename):
if "train2017" in imagename:
imagename = imagename.replace("train2017",
"val2017")
if not os.path.exists(imagename):
print("cannot find imagepath:{}".format(
imagename))
continue
else:
print("cannot find imagepath:{}".format(imagename))
continue
new_anno['imageName'] = imagename
if 'human3.6m' in imagename:
self.human36m_num += 1
new_anno['bbox_center'] = anno['bbox_center']
new_anno['bbox_scale'] = anno['bbox_scale']
new_anno['joints_2d'] = np.array(anno[
'gt_keypoint_2d']).astype(np.float32)
if new_anno['joints_2d'].shape[0] == 49:
# if the joints_2d is in SPIN format (which is generated by EFT), keep the last 24 public joints
# for details please refer to: https://github.com/nkolot/SPIN/blob/master/constants.py
new_anno['joints_2d'] = new_anno['joints_2d'][25:]
new_anno['joints_3d'] = np.array(anno[
'pose3d'])[:, :3].astype(np.float32)
new_anno['mjm_mask'] = self.get_mask()
if not 'has_3d_joints' in anno:
new_anno['has_3d_joints'] = int(1)
new_anno['has_2d_joints'] = int(1)
else:
new_anno['has_3d_joints'] = int(anno['has_3d_joints'])
new_anno['has_2d_joints'] = int(anno['has_2d_joints'])
new_anno['joints_2d'] = self.filterjoints(new_anno[
'joints_2d'])
self.annos.append(new_anno)
del annos
def get_temp_num(self):
"""get temporal data number, like human3.6m"""
return self.human36m_num
def __len__(self):
"""Get dataset length."""
return len(self.annos)
def _get_imganno(self, idx):
"""Get anno for a single image."""
return self.annos[idx]
def __getitem__(self, idx):
"""Prepare image for training given the index."""
records = copy.deepcopy(self._get_imganno(idx))
imgpath = records['imageName']
assert os.path.exists(imgpath), "cannot find image {}".format(imgpath)
records['image'] = cv2.imread(imgpath)
records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB)
records = self.transform(records)
return records
def check_or_download_dataset(self):
alldatafind = True
for image_dir in self.image_dirs:
image_dir = os.path.join(self.dataset_dir, image_dir)
if not os.path.isdir(image_dir):
print("dataset [{}] is not found".format(image_dir))
alldatafind = False
if not alldatafind:
raise ValueError(
"Some dataset is not valid and cannot download automatically now, please prepare the dataset first"
)
@register
@serializable
class Keypoint3DMultiFramesDataset(Dataset):
"""24 keypoints 3D dataset for pose estimation.
each item is a list of images
The dataset loads raw features and apply specified transforms
to return a dict containing the image tensors and other information.
Args:
dataset_dir (str): Root path to the dataset.
image_dir (str): Path to a directory where images are held.
"""
def __init__(
self,
dataset_dir, # root directory of the dataset
image_dir, # directory holding the images
p3d_dir, # directory holding the 3D keypoint files
json_path,
img_size, # target image size after resizing
num_frames, # length of each frame sequence
anno_path=None, ):
self.dataset_dir = dataset_dir
self.image_dir = image_dir
self.p3d_dir = p3d_dir
self.json_path = json_path
self.img_size = img_size
self.num_frames = num_frames
self.anno_path = anno_path
self.data_labels, self.mf_inds = self._generate_multi_frames_list()
def _generate_multi_frames_list(self):
act_list = os.listdir(self.dataset_dir) # list of actions
count = 0
mf_list = []
annos_dict = {'images': [], 'annotations': [], 'act_inds': []}
for act in act_list: # generate frame sequences for each action
if '.' in act:
continue
json_path = os.path.join(self.dataset_dir, act, self.json_path)
with open(json_path, 'r') as j:
annos = json.load(j)
length = len(annos['images'])
for k, v in annos.items():
if k in annos_dict:
annos_dict[k].extend(v)
annos_dict['act_inds'].extend([act] * length)
mf = [[i + j + count for j in range(self.num_frames)]
for i in range(0, length - self.num_frames + 1)]
mf_list.extend(mf)
count += length
print("total data number:", len(mf_list))
return annos_dict, mf_list
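# For example, with num_frames = 6 and an action of 8 frames starting at
# global index 100, the generated windows are [100..105], [101..106] and
# [102..107].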
def __call__(self, *args, **kwargs):
return self
def __getitem__(self, index): # fetch one continuous frame sequence
inds = self.mf_inds[
index] # e.g. [568, 569, 570, 571, 572, 573], with length num_frames
images = self.data_labels['images'] # all images
annots = self.data_labels['annotations'] # all annots
act = self.data_labels['act_inds'][inds[0]] # action name (folder name)
kps3d_list = []
kps3d_vis_list = []
names = []
h, w = 0, 0
for ind in inds: # one image
height = float(images[ind]['height'])
width = float(images[ind]['width'])
name = images[ind]['file_name'] # image file name, with extension
kps3d_name = name.split('.')[0] + '.obj'
kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir,
kps3d_name)
joints, joints_vis = self.kps3d_process(kps3d_path)
joints_vis = np.array(joints_vis, dtype=np.float32)
kps3d_list.append(joints)
kps3d_vis_list.append(joints_vis)
names.append(name)
kps3d = np.array(kps3d_list) # (6, 24, 3),(num_frames, joints_num, 3)
kps3d_vis = np.array(kps3d_vis_list)
# read image
imgs = []
for name in names:
img_path = os.path.join(self.dataset_dir, act, self.image_dir, name)
image = cv2.imread(img_path, cv2.IMREAD_COLOR |
cv2.IMREAD_IGNORE_ORIENTATION)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
imgs.append(np.expand_dims(image, axis=0))
imgs = np.concatenate(imgs, axis=0)
imgs = imgs.astype(
np.float32) # (6, 1080, 1920, 3),(num_frames, h, w, c)
# attention: images and annotations are mirrored at this point
records = {
'kps3d': kps3d,
'kps3d_vis': kps3d_vis,
"image": imgs,
'act': act,
'names': names,
'im_id': index
}
return self.transform(records)
def kps3d_process(self, kps3d_path):
count = 0
kps = []
kps_vis = []
with open(kps3d_path, 'r') as f:
lines = f.readlines()
for line in lines:
if line[0] == 'v':
kps.append([])
line = line.strip('\n').split(' ')[1:]
for kp in line:
kps[-1].append(float(kp))
count += 1
kps_vis.append([1, 1, 1])
kps = np.array(kps) # all parsed vertices (e.g. 523 of them), shape (N, 3)
kps_vis = np.array(kps_vis)
kps *= 10 # scale points
kps -= kps[[0], :] # set root point to zero
kps = np.concatenate((kps[0:23], kps[[37]]), axis=0) # 24,3
kps *= 10
kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0) # 24,3
return kps, kps_vis
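# Sketch of the expected .obj content (illustrative): only vertex lines such as
#   v 0.012 -0.345 1.208
# are read; the 24 keypoints kept afterwards are vertices 0-22 plus vertex 37.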
def __len__(self):
return len(self.mf_inds)
def get_anno(self):
if self.anno_path is None:
return
return os.path.join(self.dataset_dir, self.anno_path)
def check_or_download_dataset(self):
return
def parse_dataset(self, ):
return
def set_transform(self, transform):
self.transform = transform
def set_epoch(self, epoch_id):
self._epoch = epoch_id
def set_kwargs(self, **kwargs):
self.mixup_epoch = kwargs.get('mixup_epoch', -1)
self.cutmix_epoch = kwargs.get('cutmix_epoch', -1)
self.mosaic_epoch = kwargs.get('mosaic_epoch', -1)

View File

@@ -0,0 +1,194 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import cv2
import json
import copy
import numpy as np
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from ppdet.core.workspace import register, serializable
from ppdet.data.crop_utils.annotation_cropper import AnnoCropper
from .coco import COCODataSet
from .dataset import _make_dataset, _is_valid_file
from ppdet.utils.logger import setup_logger
logger = setup_logger('sniper_coco_dataset')
@register
@serializable
class SniperCOCODataSet(COCODataSet):
"""SniperCOCODataSet"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
proposals_file=None,
data_fields=['image'],
sample_num=-1,
load_crowd=False,
allow_empty=True,
empty_ratio=1.,
is_trainset=True,
image_target_sizes=[2000, 1000],
valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]],
chip_target_size=500,
chip_target_stride=200,
use_neg_chip=False,
max_neg_num_per_im=8,
max_per_img=-1,
nms_thresh=0.5):
super(SniperCOCODataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
load_crowd=load_crowd,
allow_empty=allow_empty,
empty_ratio=empty_ratio
)
self.proposals_file = proposals_file
self.proposals = None
self.anno_cropper = None
self.is_trainset = is_trainset
self.image_target_sizes = image_target_sizes
self.valid_box_ratio_ranges = valid_box_ratio_ranges
self.chip_target_size = chip_target_size
self.chip_target_stride = chip_target_stride
self.use_neg_chip = use_neg_chip
self.max_neg_num_per_im = max_neg_num_per_im
self.max_per_img = max_per_img
self.nms_thresh = nms_thresh
def parse_dataset(self):
if not hasattr(self, "roidbs"):
super(SniperCOCODataSet, self).parse_dataset()
if self.is_trainset:
self._parse_proposals()
self._merge_anno_proposals()
self.ori_roidbs = copy.deepcopy(self.roidbs)
self.init_anno_cropper()
self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset)
def set_proposals_file(self, file_path):
self.proposals_file = file_path
def init_anno_cropper(self):
logger.info("Init AnnoCropper...")
self.anno_cropper = AnnoCropper(
image_target_sizes=self.image_target_sizes,
valid_box_ratio_ranges=self.valid_box_ratio_ranges,
chip_target_size=self.chip_target_size,
chip_target_stride=self.chip_target_stride,
use_neg_chip=self.use_neg_chip,
max_neg_num_per_im=self.max_neg_num_per_im,
max_per_img=self.max_per_img,
nms_thresh=self.nms_thresh
)
def generate_chips_roidbs(self, roidbs, is_trainset):
if is_trainset:
roidbs = self.anno_cropper.crop_anno_records(roidbs)
else:
roidbs = self.anno_cropper.crop_infer_anno_records(roidbs)
return roidbs
def _parse_proposals(self):
if self.proposals_file:
self.proposals = {}
logger.info("Parse proposals file:{}".format(self.proposals_file))
with open(self.proposals_file, 'r') as f:
proposals = json.load(f)
for prop in proposals:
image_id = prop["image_id"]
if image_id not in self.proposals:
self.proposals[image_id] = []
x, y, w, h = prop["bbox"]
self.proposals[image_id].append([x, y, x + w, y + h])
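# The proposals file is expected to be a JSON list of dicts carrying at least
# "image_id" and an [x, y, w, h] "bbox" (illustrative entry, not a real file):
#   [{"image_id": 42, "bbox": [10.0, 20.0, 100.0, 50.0]}, ...]
# each bbox is converted to [x1, y1, x2, y2] above.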
def _merge_anno_proposals(self):
assert self.roidbs
if self.proposals and len(self.proposals.keys()) > 0:
logger.info("merge proposals to annos")
for id, record in enumerate(self.roidbs):
image_id = int(record["im_id"])
if image_id not in self.proposals.keys():
logger.info("image id :{} no proposals".format(image_id))
record["proposals"] = np.array(self.proposals.get(image_id, []), dtype=np.float32)
self.roidbs[id] = record
def get_ori_roidbs(self):
if not hasattr(self, "ori_roidbs"):
return None
return self.ori_roidbs
def get_roidbs(self):
if not hasattr(self, "roidbs"):
self.parse_dataset()
return self.roidbs
def set_roidbs(self, roidbs):
self.roidbs = roidbs
def check_or_download_dataset(self):
return
def _parse(self):
image_dir = self.image_dir
if not isinstance(image_dir, Sequence):
image_dir = [image_dir]
images = []
for im_dir in image_dir:
if os.path.isdir(im_dir):
im_dir = os.path.join(self.dataset_dir, im_dir)
images.extend(_make_dataset(im_dir))
elif os.path.isfile(im_dir) and _is_valid_file(im_dir):
images.append(im_dir)
return images
def _load_images(self):
images = self._parse()
ct = 0
records = []
for image in images:
assert image != '' and os.path.isfile(image), \
"Image {} not found".format(image)
if self.sample_num > 0 and ct >= self.sample_num:
break
im = cv2.imread(image)
h, w, c = im.shape
rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w}
self._imid2path[ct] = image
ct += 1
records.append(rec)
assert len(records) > 0, "No image file found"
return records
def get_imid2path(self):
return self._imid2path
def set_images(self, images):
self._imid2path = {}
self.image_dir = images
self.roidbs = self._load_images()

View File

@@ -0,0 +1,234 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import xml.etree.ElementTree as ET
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class VOCDataSet(DetDataset):
"""
Load dataset with PascalVOC format.
Notes:
`anno_path` must contains xml file and image file path for annotations.
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): voc annotation file path.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
label_list (str): if use_default_label is False, will load
mapping between category and class index.
allow_empty (bool): whether to load empty entry. False as default
empty_ratio (float): the ratio of empty records to total
records; if empty_ratio is out of [0., 1.), do not sample the
records and use all the empty entries. 1. as default
repeat (int): repeat times for dataset, use in benchmark.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
label_list=None,
allow_empty=False,
empty_ratio=1.,
repeat=1):
super(VOCDataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
repeat=repeat)
self.label_list = label_list
self.allow_empty = allow_empty
self.empty_ratio = empty_ratio
def _sample_empty(self, records, num):
# if empty_ratio is out of [0. ,1.), do not sample the records
if self.empty_ratio < 0. or self.empty_ratio >= 1.:
return records
import random
sample_num = min(
int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records))
records = random.sample(records, sample_num)
return records
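# For example, with empty_ratio = 0.2 and num = 100 non-empty records, at most
# int(100 * 0.2 / 0.8) = 25 empty records are kept (fewer if not that many
# exist), so empty entries make up roughly 20% of the final set.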
def parse_dataset(self, ):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
# mapping category name to class id
# first_class:0, second_class:1, ...
records = []
empty_records = []
ct = 0
cname2cid = {}
if self.label_list:
label_path = os.path.join(self.dataset_dir, self.label_list)
if not os.path.exists(label_path):
raise ValueError("label_list {} does not exists".format(
label_path))
with open(label_path, 'r') as fr:
label_id = 0
for line in fr.readlines():
cname2cid[line.strip()] = label_id
label_id += 1
else:
cname2cid = pascalvoc_label()
with open(anno_path, 'r') as fr:
while True:
line = fr.readline()
if not line:
break
img_file, xml_file = [os.path.join(image_dir, x) \
for x in line.strip().split()[:2]]
if not os.path.exists(img_file):
logger.warning(
'Illegal image file: {}, and it will be ignored'.format(
img_file))
continue
if not os.path.isfile(xml_file):
logger.warning(
'Illegal xml file: {}, and it will be ignored'.format(
xml_file))
continue
tree = ET.parse(xml_file)
if tree.find('id') is None:
im_id = np.array([ct])
else:
im_id = np.array([int(tree.find('id').text)])
objs = tree.findall('object')
im_w = float(tree.find('size').find('width').text)
im_h = float(tree.find('size').find('height').text)
if im_w < 0 or im_h < 0:
logger.warning(
'Illegal width: {} or height: {} in annotation, '
'and {} will be ignored'.format(im_w, im_h, xml_file))
continue
num_bbox, i = len(objs), 0
gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32)
gt_class = np.zeros((num_bbox, 1), dtype=np.int32)
gt_score = np.zeros((num_bbox, 1), dtype=np.float32)
difficult = np.zeros((num_bbox, 1), dtype=np.int32)
for obj in objs:
cname = obj.find('name').text
# user dataset may not contain difficult field
_difficult = obj.find('difficult')
_difficult = int(
_difficult.text) if _difficult is not None else 0
x1 = float(obj.find('bndbox').find('xmin').text)
y1 = float(obj.find('bndbox').find('ymin').text)
x2 = float(obj.find('bndbox').find('xmax').text)
y2 = float(obj.find('bndbox').find('ymax').text)
x1 = max(0, x1)
y1 = max(0, y1)
x2 = min(im_w - 1, x2)
y2 = min(im_h - 1, y2)
if x2 > x1 and y2 > y1:
gt_bbox[i, :] = [x1, y1, x2, y2]
gt_class[i, 0] = cname2cid[cname]
gt_score[i, 0] = 1.
difficult[i, 0] = _difficult
i += 1
else:
logger.warning(
'Found an invalid bbox in annotations: xml_file: {}'
', x1: {}, y1: {}, x2: {}, y2: {}.'.format(
xml_file, x1, y1, x2, y2))
gt_bbox = gt_bbox[:i, :]
gt_class = gt_class[:i, :]
gt_score = gt_score[:i, :]
difficult = difficult[:i, :]
voc_rec = {
'im_file': img_file,
'im_id': im_id,
'h': im_h,
'w': im_w
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_class': gt_class,
'gt_score': gt_score,
'gt_bbox': gt_bbox,
'difficult': difficult
}
for k, v in gt_rec.items():
if k in self.data_fields:
voc_rec[k] = v
if len(objs) == 0:
empty_records.append(voc_rec)
else:
records.append(voc_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert ct > 0, 'not found any voc record in %s' % (self.anno_path)
logger.debug('{} samples in file {}'.format(ct, anno_path))
if self.allow_empty and len(empty_records) > 0:
empty_records = self._sample_empty(empty_records, len(records))
records += empty_records
self.roidbs, self.cname2cid = records, cname2cid
def get_label_list(self):
return os.path.join(self.dataset_dir, self.label_list)
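# A minimal usage sketch (not part of the original file); the paths and file
# names below are assumptions; the anno file lists "image.jpg anno.xml" pairs.
def _voc_dataset_usage_sketch():
    dataset = VOCDataSet(
        dataset_dir='dataset/voc',
        image_dir='VOCdevkit/VOC2007',
        anno_path='trainval.txt',
        label_list='label_list.txt',
        data_fields=['image', 'gt_bbox', 'gt_class', 'difficult'])
    dataset.parse_dataset()
    # mapping from category name to class id built during parsing
    return dataset.cname2cid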
def pascalvoc_label():
labels_map = {
'aeroplane': 0,
'bicycle': 1,
'bird': 2,
'boat': 3,
'bottle': 4,
'bus': 5,
'car': 6,
'cat': 7,
'chair': 8,
'cow': 9,
'diningtable': 10,
'dog': 11,
'horse': 12,
'motorbike': 13,
'person': 14,
'pottedplant': 15,
'sheep': 16,
'sofa': 17,
'train': 18,
'tvmonitor': 19
}
return labels_map

View File

@@ -0,0 +1,180 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from ppdet.core.workspace import register, serializable
from .dataset import DetDataset
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
@register
@serializable
class WIDERFaceDataSet(DetDataset):
"""
Load WiderFace records with 'anno_path'
Args:
dataset_dir (str): root directory for dataset.
image_dir (str): directory for images.
anno_path (str): WiderFace annotation data.
data_fields (list): key name of data dictionary, at least have 'image'.
sample_num (int): number of samples to load, -1 means all.
with_lmk (bool): whether to load face landmark keypoint labels.
"""
def __init__(self,
dataset_dir=None,
image_dir=None,
anno_path=None,
data_fields=['image'],
sample_num=-1,
with_lmk=False):
super(WIDERFaceDataSet, self).__init__(
dataset_dir=dataset_dir,
image_dir=image_dir,
anno_path=anno_path,
data_fields=data_fields,
sample_num=sample_num,
with_lmk=with_lmk)
self.anno_path = anno_path
self.sample_num = sample_num
self.roidbs = None
self.cname2cid = None
self.with_lmk = with_lmk
def parse_dataset(self):
anno_path = os.path.join(self.dataset_dir, self.anno_path)
image_dir = os.path.join(self.dataset_dir, self.image_dir)
txt_file = anno_path
records = []
ct = 0
file_lists = self._load_file_list(txt_file)
cname2cid = widerface_label()
for item in file_lists:
im_fname = item[0]
im_id = np.array([ct])
gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32)
gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32)
gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32)
lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32)
for index_box in range(len(item)):
if index_box < 1:
continue
gt_bbox[index_box - 1] = item[index_box][0]
if self.with_lmk:
gt_lmk_labels[index_box - 1] = item[index_box][1]
lmk_ignore_flag[index_box - 1] = item[index_box][2]
im_fname = os.path.join(image_dir,
im_fname) if image_dir else im_fname
widerface_rec = {
'im_file': im_fname,
'im_id': im_id,
} if 'image' in self.data_fields else {}
gt_rec = {
'gt_bbox': gt_bbox,
'gt_class': gt_class,
}
for k, v in gt_rec.items():
if k in self.data_fields:
widerface_rec[k] = v
if self.with_lmk:
widerface_rec['gt_keypoint'] = gt_lmk_labels
widerface_rec['keypoint_ignore'] = lmk_ignore_flag
if len(item) != 0:
records.append(widerface_rec)
ct += 1
if self.sample_num > 0 and ct >= self.sample_num:
break
assert len(records) > 0, 'not found any widerface in %s' % (anno_path)
logger.debug('{} samples in file {}'.format(ct, anno_path))
self.roidbs, self.cname2cid = records, cname2cid
def _load_file_list(self, input_txt):
with open(input_txt, 'r') as f_dir:
lines_input_txt = f_dir.readlines()
file_dict = {}
num_class = 0
exts = ['jpg', 'jpeg', 'png', 'bmp']
exts += [ext.upper() for ext in exts]
for i in range(len(lines_input_txt)):
line_txt = lines_input_txt[i].strip('\n\t\r')
split_str = line_txt.split(' ')
if len(split_str) == 1:
img_file_name = os.path.split(split_str[0])[1]
split_txt = img_file_name.split('.')
if len(split_txt) < 2:
continue
elif split_txt[-1] in exts:
if i != 0:
num_class += 1
file_dict[num_class] = [line_txt]
else:
if len(line_txt) <= 6:
continue
result_boxs = []
xmin = float(split_str[0])
ymin = float(split_str[1])
w = float(split_str[2])
h = float(split_str[3])
# Filter out wrong labels
if w < 0 or h < 0:
logger.warning('Illegal box with w: {}, h: {} in '
'img: {}, and it will be ignored'.format(
w, h, file_dict[num_class][0]))
continue
xmin = max(0, xmin)
ymin = max(0, ymin)
xmax = xmin + w
ymax = ymin + h
gt_bbox = [xmin, ymin, xmax, ymax]
result_boxs.append(gt_bbox)
if self.with_lmk:
assert len(split_str) > 18, 'When `with_lmk=True`, the number ' \
'of fields per line in the annotation file should ' \
'exceed 18.'
lmk0_x = float(split_str[5])
lmk0_y = float(split_str[6])
lmk1_x = float(split_str[8])
lmk1_y = float(split_str[9])
lmk2_x = float(split_str[11])
lmk2_y = float(split_str[12])
lmk3_x = float(split_str[14])
lmk3_y = float(split_str[15])
lmk4_x = float(split_str[17])
lmk4_y = float(split_str[18])
lmk_ignore_flag = 0 if lmk0_x == -1 else 1
gt_lmk_label = [
lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x,
lmk3_y, lmk4_x, lmk4_y
]
result_boxs.append(gt_lmk_label)
result_boxs.append(lmk_ignore_flag)
file_dict[num_class].append(result_boxs)
return list(file_dict.values())
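# Annotation sketch (hedged, inferred from the parsing above): a line holding a
# single token is treated as an image path, and each following line holds at
# least 'xmin ymin w h'; with `with_lmk=True` five landmark (x, y) pairs are
# additionally read from zero-based columns 5-6, 8-9, 11-12, 14-15 and 17-18.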
def widerface_label():
labels_map = {'face': 0}
return labels_map

View File

@@ -0,0 +1,35 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import operators
from . import batch_operators
from . import keypoint_operators
from . import mot_operators
from . import rotated_operators
from . import keypoints_3d_operators
from . import culane_operators
from .operators import *
from .batch_operators import *
from .keypoint_operators import *
from .mot_operators import *
from .rotated_operators import *
from .keypoints_3d_operators import *
from .culane_operators import *
__all__ = []
__all__ += registered_ops
__all__ += keypoint_operators.__all__
__all__ += mot_operators.__all__
__all__ += culane_operators.__all__

View File

@@ -0,0 +1,421 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6):
"""Calculate overlap between two set of bboxes.
If ``is_aligned `` is ``False``, then calculate the overlaps between each
bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
pair of bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned `` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "iof" (intersection over
foreground).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
eps (float, optional): A value added to the denominator for numerical
stability. Default 1e-6.
Returns:
Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
"""
assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format(
mode)
# Either the boxes are empty or the length of the boxes' last dimension is 4
assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return np.random.random(batch_shape + (rows, ))
else:
return np.random.random(batch_shape + (rows, cols))
area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (
bboxes1[..., 3] - bboxes1[..., 1])
area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (
bboxes2[..., 3] - bboxes2[..., 1])
if is_aligned:
lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2]
rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2]
wh = (rb - lt).clip(min=0) # [B, rows, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
if mode == 'diou':
enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2])
enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:])
b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1]
b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3]
b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1]
b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3]
else:
lt = np.maximum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2]) # [B, rows, cols, 2]
rb = np.minimum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:]) # [B, rows, cols, 2]
wh = (rb - lt).clip(min=0) # [B, rows, cols, 2]
overlap = wh[..., 0] * wh[..., 1]
if mode in ['iou', 'giou']:
union = area1[..., None] + area2[..., None, :] - overlap
else:
union = area1[..., None]
if mode == 'giou':
enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2])
enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:])
if mode == 'diou':
enclosed_lt = np.minimum(bboxes1[..., :, None, :2],
bboxes2[..., None, :, :2])
enclosed_rb = np.maximum(bboxes1[..., :, None, 2:],
bboxes2[..., None, :, 2:])
b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1]
b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3]
b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1]
b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3]
eps = np.array([eps])
union = np.maximum(union, eps)
ious = overlap / union
if mode in ['iou', 'iof']:
return ious
# calculate gious
if mode in ['giou']:
enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
enclose_area = np.maximum(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return gious
if mode in ['diou']:
left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
rho2 = left + right
enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2
enclose_c = np.maximum(enclose_c, eps)
dious = ious - rho2 / enclose_c
return dious
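# Worked example (illustrative): for a = [0, 0, 10, 10] and b = [5, 5, 15, 15]
# the intersection is 5 * 5 = 25 and the union is 100 + 100 - 25 = 175, so
# bbox_overlaps(np.array([a], 'float32'), np.array([b], 'float32')) returns an
# IoU of roughly 0.143.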
def topk_(input, k, axis=1, largest=True):
x = -input if largest else input
if axis == 0:
row_index = np.arange(input.shape[1 - axis])
if k == x.shape[0]: # argpartition requires index < len(input)
topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :]
else:
topk_index = np.argpartition(x, k, axis=axis)[0:k, :]
topk_data = x[topk_index, row_index]
topk_index_sort = np.argsort(topk_data, axis=axis)
topk_data_sort = topk_data[topk_index_sort, row_index]
topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index]
else:
column_index = np.arange(x.shape[1 - axis])[:, None]
topk_index = np.argpartition(x, k, axis=axis)[:, 0:k]
topk_data = x[column_index, topk_index]
topk_data = -topk_data if largest else topk_data
topk_index_sort = np.argsort(topk_data, axis=axis)
topk_data_sort = topk_data[column_index, topk_index_sort]
topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort]
return topk_data_sort, topk_index_sort
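# For example (illustrative): topk_(np.array([[3., 1., 2.]]), k=2, axis=1,
# largest=False) returns values [[1., 2.]] and indices [[1, 2]].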
class ATSSAssigner(object):
"""Assign a corresponding gt bbox or background to each bbox.
Each proposals will be assigned with `0` or a positive integer
indicating the ground truth index.
- 0: negative sample, no assigned gt
- positive integer: positive sample, index (1-based) of assigned gt
Args:
topk (int): number of bboxes selected on each level
"""
def __init__(self, topk=9):
self.topk = topk
def __call__(self,
bboxes,
num_level_bboxes,
gt_bboxes,
gt_bboxes_ignore=None,
gt_labels=None):
"""Assign gt to bboxes.
The assignment is done in following steps
1. compute iou between all bbox (bbox of all pyramid levels) and gt
2. compute center distance between all bbox and gt
3. on each pyramid level, for each gt, select the k bboxes whose centers
are closest to the gt center, so we select k*l bboxes in total as
candidates for each gt
4. get the corresponding iou for these candidates, and compute the
mean and std; set mean + std as the iou threshold
5. select the candidates whose iou is greater than or equal to
the threshold as positive
6. limit the positive sample's center in gt
Args:
bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
num_level_bboxes (List): num of bboxes in each level
gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
"""
bboxes = bboxes[:, :4]
num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
# assign 0 by default
assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
if num_gt == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
max_overlaps = np.zeros((num_bboxes, ))
if num_gt == 0:
# No truth, assign everything to background
assigned_gt_inds[:] = 0
if not np.any(gt_labels):
assigned_labels = None
else:
assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
return assigned_gt_inds, max_overlaps
# compute iou between all bbox and gt
overlaps = bbox_overlaps(bboxes, gt_bboxes)
# compute center distance between all bbox and gt
gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
gt_points = np.stack((gt_cx, gt_cy), axis=1)
bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
distances = np.sqrt(
np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
.sum(-1))
# Selecting candidates based on the center distance
candidate_idxs = []
start_idx = 0
for bboxes_per_level in num_level_bboxes:
# on each pyramid level, for each gt,
# select k bbox whose center are closest to the gt center
end_idx = start_idx + bboxes_per_level
distances_per_level = distances[start_idx:end_idx, :]
selectable_k = min(self.topk, bboxes_per_level)
_, topk_idxs_per_level = topk_(
distances_per_level, selectable_k, axis=0, largest=False)
candidate_idxs.append(topk_idxs_per_level + start_idx)
start_idx = end_idx
candidate_idxs = np.concatenate(candidate_idxs, axis=0)
# get the corresponding iou for these candidates, and compute the
# mean and std, set mean + std as the iou threshold
candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)]
overlaps_mean_per_gt = candidate_overlaps.mean(0)
overlaps_std_per_gt = candidate_overlaps.std(0)
overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :]
# limit the positive sample's center in gt
for gt_idx in range(num_gt):
candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
ep_bboxes_cx = np.broadcast_to(
bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
ep_bboxes_cy = np.broadcast_to(
bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1)
candidate_idxs = candidate_idxs.reshape(-1)
# calculate the left, top, right, bottom distance between positive
# bbox center and gt side
l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0]
t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1]
r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt)
b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt)
is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01
is_pos = is_pos & is_in_gts
# if an anchor box is assigned to multiple gts,
# the one with the highest IoU will be selected.
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
overlaps_inf[index] = overlaps.T.reshape(-1)[index]
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
max_overlaps = overlaps_inf.max(axis=1)
argmax_overlaps = overlaps_inf.argmax(axis=1)
assigned_gt_inds[max_overlaps !=
-np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
return assigned_gt_inds, max_overlaps
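# Sketch of the thresholding step (illustrative numbers): if the candidate IoUs
# for one gt are [0.5, 0.3, 0.2, 0.1], their mean is 0.275 and std is about
# 0.148, so only candidates with IoU >= ~0.423 (here just the 0.5 one) whose
# centers also fall inside the gt become positive samples.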
def get_vlr_region(self,
bboxes,
num_level_bboxes,
gt_bboxes,
gt_bboxes_ignore=None,
gt_labels=None):
"""get vlr region for ld distillation.
Args:
bboxes (np.array): Bounding boxes to be assigned, shape(n, 4).
num_level_bboxes (List): num of bboxes in each level
gt_bboxes (np.array): Groundtruth boxes, shape (k, 4).
gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are
labelled as `ignored`, e.g., crowd boxes in COCO.
gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ).
"""
bboxes = bboxes[:, :4]
num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0]
# compute iou between all bbox and gt
overlaps = bbox_overlaps(bboxes, gt_bboxes)
# compute diou between all bbox and gt
diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou')
# assign 0 by default
assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64)
vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32)
if num_gt == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
max_overlaps = np.zeros((num_bboxes, ))
if num_gt == 0:
# No truth, assign everything to background
assigned_gt_inds[:] = 0
if not np.any(gt_labels):
assigned_labels = None
else:
assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64)
return assigned_gt_inds, max_overlaps
# compute center distance between all bbox and gt
gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
gt_points = np.stack((gt_cx, gt_cy), axis=1)
bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0
bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0
bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1)
distances = np.sqrt(
np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2)
.sum(-1))
# Selecting candidates based on the center distance
candidate_idxs = []
candidate_idxs_t = []
start_idx = 0
for bboxes_per_level in num_level_bboxes:
            # on each pyramid level, for each gt,
            # select the k bboxes whose centers are closest to the gt center
end_idx = start_idx + bboxes_per_level
distances_per_level = distances[start_idx:end_idx, :]
selectable_t = min(self.topk, bboxes_per_level)
            selectable_k = bboxes_per_level  # take all bboxes on this level
_, topt_idxs_per_level = topk_(
distances_per_level, selectable_t, axis=0, largest=False)
_, topk_idxs_per_level = topk_(
distances_per_level, selectable_k, axis=0, largest=False)
candidate_idxs_t.append(topt_idxs_per_level + start_idx)
candidate_idxs.append(topk_idxs_per_level + start_idx)
start_idx = end_idx
candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0)
candidate_idxs = np.concatenate(candidate_idxs, axis=0)
        # get the corresponding iou for these candidates, compute their
        # mean and std, and set mean + std as the iou threshold
candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)]
# compute tdiou
t_diou = diou[candidate_idxs, np.arange(num_gt)]
overlaps_mean_per_gt = candidate_overlaps_t.mean(0)
overlaps_std_per_gt = candidate_overlaps_t.std(
0, ddof=1) # NOTE: use Bessel correction
overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt
# compute region
is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & (
t_diou >= 0.25 * overlaps_thr_per_gt[None, :])
        # keep only positive samples whose centers lie inside the gt box
for gt_idx in range(num_gt):
candidate_idxs[:, gt_idx] += gt_idx * num_bboxes
candidate_idxs = candidate_idxs.reshape(-1)
# if an anchor box is assigned to multiple gts,
# the one with the highest IoU will be selected.
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)]
overlaps_inf[index] = overlaps.T.reshape(-1)[index]
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
max_overlaps = overlaps_inf.max(axis=1)
argmax_overlaps = overlaps_inf.argmax(axis=1)
overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1)
overlaps_inf = overlaps_inf.reshape(num_gt, -1).T
assigned_gt_inds[max_overlaps !=
-np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1
vlr_region_iou[max_overlaps !=
-np.inf] = max_overlaps[max_overlaps != -np.inf] + 0
return vlr_region_iou
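# --- Illustrative sketch (standalone, not part of the assigner) ----------------
# assign() above follows an ATSS-style recipe: per-gt candidates are picked by
# center distance on every FPN level, and the IoU threshold is adaptive, i.e.
# the mean plus the std of the candidate IoUs. A minimal NumPy sketch of that
# thresholding step on made-up toy values:
import numpy as np
candidate_overlaps = np.array([[0.1, 0.4],
                               [0.3, 0.5],
                               [0.6, 0.2]])  # shape: (num_candidates, num_gt)
adaptive_thr = candidate_overlaps.mean(0) + candidate_overlaps.std(0)
is_pos = candidate_overlaps >= adaptive_thr[None, :]  # per (candidate, gt) mask
print(adaptive_thr)  # one adaptive IoU threshold per gt
print(is_pos)        # only the highest-IoU candidates survive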

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,366 @@
import numpy as np
import imgaug.augmenters as iaa
from .operators import BaseOperator, register_op
from ppdet.utils.logger import setup_logger
from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation
logger = setup_logger(__name__)
__all__ = [
"CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
"ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
"MultiplyAndAddToBrightness", "AddToHueAndSaturation"
]
def trainTransforms(img_h, img_w):
transforms = [{
'name': 'Resize',
'parameters': dict(size=dict(
height=img_h, width=img_w)),
'p': 1.0
}, {
'name': 'HorizontalFlip',
'parameters': dict(p=1.0),
'p': 0.5
}, {
'name': 'ChannelShuffle',
'parameters': dict(p=1.0),
'p': 0.1
}, {
'name': 'MultiplyAndAddToBrightness',
'parameters': dict(
mul=(0.85, 1.15), add=(-10, 10)),
'p': 0.6
}, {
'name': 'AddToHueAndSaturation',
'parameters': dict(value=(-10, 10)),
'p': 0.7
}, {
'name': 'OneOf',
'transforms': [
dict(
name='MotionBlur', parameters=dict(k=(3, 5))), dict(
name='MedianBlur', parameters=dict(k=(3, 5)))
],
'p': 0.2
}, {
'name': 'Affine',
'parameters': dict(
translate_percent=dict(
x=(-0.1, 0.1), y=(-0.1, 0.1)),
rotate=(-10, 10),
scale=(0.8, 1.2)),
'p': 0.7
}, {
'name': 'Resize',
'parameters': dict(size=dict(
height=img_h, width=img_w)),
'p': 1.0
}]
return transforms
@register_op
class CULaneTrainProcess(BaseOperator):
def __init__(self, img_w, img_h):
super(CULaneTrainProcess, self).__init__()
self.img_w = img_w
self.img_h = img_h
self.transforms = trainTransforms(self.img_h, self.img_w)
if self.transforms is not None:
img_transforms = []
for aug in self.transforms:
p = aug['p']
if aug['name'] != 'OneOf':
img_transforms.append(
iaa.Sometimes(
p=p,
then_list=getattr(iaa, aug['name'])(**aug[
'parameters'])))
else:
img_transforms.append(
iaa.Sometimes(
p=p,
then_list=iaa.OneOf([
getattr(iaa, aug_['name'])(**aug_['parameters'])
for aug_ in aug['transforms']
])))
else:
img_transforms = []
self.iaa_transform = iaa.Sequential(img_transforms)
def apply(self, sample, context=None):
img, line_strings, seg = self.iaa_transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
return sample
@register_op
class CULaneDataProcess(BaseOperator):
def __init__(self, img_w, img_h, num_points, max_lanes):
super(CULaneDataProcess, self).__init__()
self.img_w = img_w
self.img_h = img_h
self.num_points = num_points
self.n_offsets = num_points
self.n_strips = num_points - 1
self.strip_size = self.img_h / self.n_strips
self.max_lanes = max_lanes
self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size)
def apply(self, sample, context=None):
data = {}
line_strings = sample['lanes']
line_strings.clip_out_of_image_()
new_anno = {'lanes': linestrings_to_lanes(line_strings)}
for i in range(30):
try:
annos = transform_annotation(
self.img_w, self.img_h, self.max_lanes, self.n_offsets,
self.offsets_ys, self.n_strips, self.strip_size, new_anno)
label = annos['label']
lane_endpoints = annos['lane_endpoints']
break
except:
if (i + 1) == 30:
logger.critical('Transform annotation failed 30 times :(')
exit()
sample['image'] = sample['image'].astype(np.float32) / 255.
data['image'] = sample['image'].transpose(2, 0, 1)
data['lane_line'] = label
data['seg'] = sample['seg']
data['full_img_path'] = sample['full_img_path']
data['img_name'] = sample['img_name']
data['im_id'] = sample['im_id']
if 'mask' in sample.keys():
data['seg'] = sample['mask'].get_arr()
data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32)
data['scale_factor'] = np.array([1., 1.], dtype=np.float32)
return data
@register_op
class CULaneResize(BaseOperator):
def __init__(self, img_h, img_w, prob=0.5):
super(CULaneResize, self).__init__()
self.img_h = img_h
self.img_w = img_w
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob,
iaa.Resize({
"height": self.img_h,
"width": self.img_w
}))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'].copy().astype(np.uint8),
line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
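# --- Illustrative usage sketch (standalone, not wired into any config) ---------
# Shows CULaneResize on a synthetic sample. Assumes imgaug is installed; the
# image size and the lane coordinates below are made-up toy values.
import numpy as np
from imgaug.augmentables.lines import LineString, LineStringsOnImage
_img = np.zeros((590, 1640, 3), dtype=np.uint8)
_lanes = LineStringsOnImage(
    [LineString([(100, 580), (400, 300)])], shape=_img.shape)
_sample = {'image': _img, 'lanes': _lanes}
_resize = CULaneResize(img_h=320, img_w=800, prob=1.0)  # prob=1.0: always resize
_out = _resize.apply(_sample)
print(_out['image'].shape)                       # (320, 800, 3)
print(_out['lanes'].line_strings[0].coords[:1])  # lane points scaled to the new size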
@register_op
class HorizontalFlip(BaseOperator):
def __init__(self, prob=0.5):
super(HorizontalFlip, self).__init__()
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class ChannelShuffle(BaseOperator):
def __init__(self, prob=0.1):
super(ChannelShuffle, self).__init__()
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class MultiplyAndAddToBrightness(BaseOperator):
def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5):
super(MultiplyAndAddToBrightness, self).__init__()
self.mul = tuple(mul)
self.add = tuple(add)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.MultiplyAndAddToBrightness(
mul=self.mul, add=self.add))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class AddToHueAndSaturation(BaseOperator):
def __init__(self, value=(-10, 10), prob=0.5):
super(AddToHueAndSaturation, self).__init__()
self.value = tuple(value)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob, iaa.AddToHueAndSaturation(value=self.value))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class OneOfBlur(BaseOperator):
def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5):
super(OneOfBlur, self).__init__()
self.MotionBlur_k = tuple(MotionBlur_k)
self.MedianBlur_k = tuple(MedianBlur_k)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.OneOf([
iaa.MotionBlur(k=self.MotionBlur_k),
iaa.MedianBlur(k=self.MedianBlur_k)
]))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample
@register_op
class CULaneAffine(BaseOperator):
def __init__(self,
translate_percent_x=(-0.1, 0.1),
translate_percent_y=(-0.1, 0.1),
rotate=(3, 5),
scale=(0.8, 1.2),
prob=0.5):
super(CULaneAffine, self).__init__()
self.translate_percent = {
'x': tuple(translate_percent_x),
'y': tuple(translate_percent_y)
}
self.rotate = tuple(rotate)
self.scale = tuple(scale)
self.prob = prob
def apply(self, sample, context=None):
transform = iaa.Sometimes(
self.prob,
iaa.Affine(
translate_percent=self.translate_percent,
rotate=self.rotate,
scale=self.scale))
if 'mask' in sample.keys():
img, line_strings, seg = transform(
image=sample['image'],
line_strings=sample['lanes'],
segmentation_maps=sample['mask'])
sample['image'] = img
sample['lanes'] = line_strings
sample['mask'] = seg
else:
img, line_strings = transform(
image=sample['image'], line_strings=sample['lanes'])
sample['image'] = img
sample['lanes'] = line_strings
return sample

View File

@@ -0,0 +1,86 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import numpy as np
from PIL import Image
class Gridmask(object):
def __init__(self,
use_h=True,
use_w=True,
rotate=1,
offset=False,
ratio=0.5,
mode=1,
prob=0.7,
upper_iter=360000):
super(Gridmask, self).__init__()
self.use_h = use_h
self.use_w = use_w
self.rotate = rotate
self.offset = offset
self.ratio = ratio
self.mode = mode
self.prob = prob
self.st_prob = prob
self.upper_iter = upper_iter
def __call__(self, x, curr_iter):
self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter)
if np.random.rand() > self.prob:
return x
h, w, _ = x.shape
hh = int(1.5 * h)
ww = int(1.5 * w)
d = np.random.randint(2, h)
self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1)
mask = np.ones((hh, ww), np.float32)
st_h = np.random.randint(d)
st_w = np.random.randint(d)
if self.use_h:
for i in range(hh // d):
s = d * i + st_h
t = min(s + self.l, hh)
mask[s:t, :] *= 0
if self.use_w:
for i in range(ww // d):
s = d * i + st_w
t = min(s + self.l, ww)
mask[:, s:t] *= 0
r = np.random.randint(self.rotate)
mask = Image.fromarray(np.uint8(mask))
mask = mask.rotate(r)
mask = np.asarray(mask)
mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2
+ w].astype(np.float32)
if self.mode == 1:
mask = 1 - mask
mask = np.expand_dims(mask, axis=-1)
if self.offset:
offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32)
x = (x * mask + offset * (1 - mask)).astype(x.dtype)
else:
x = (x * mask).astype(x.dtype)
return x
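# --- Illustrative usage sketch (standalone, toy sizes) --------------------------
# Shows how the GridMask probability ramps up with curr_iter and how the op is
# applied to an HWC uint8 image; not a training configuration.
import numpy as np
_gridmask = Gridmask(prob=0.7, upper_iter=360000)
_img = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
_early = _gridmask(_img, curr_iter=1000)   # effective prob ~= 0.7 * 1000 / 360000
_late = _gridmask(_img, curr_iter=360000)  # effective prob ~= 0.7
print(_early.shape, _late.shape)           # masking never changes the image shape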

File diff suppressed because it is too large

View File

@@ -0,0 +1,296 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
import cv2
import numpy as np
import math
import copy
import random
import uuid
from numbers import Number, Integral
from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
from ppdet.core.workspace import serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger(__name__)
registered_ops = []
__all__ = [
'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages'
]
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
from mpl_toolkits.mplot3d import Axes3D
def register_keypointop(cls):
return serializable(cls)
def register_op(cls):
registered_ops.append(cls.__name__)
if not hasattr(BaseOperator, cls.__name__):
setattr(BaseOperator, cls.__name__, cls)
else:
raise KeyError("The {} class has been registered.".format(cls.__name__))
return serializable(cls)
class BaseOperator(object):
def __init__(self, name=None):
if name is None:
name = self.__class__.__name__
self._id = name + '_' + str(uuid.uuid4())[-6:]
def apply(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
return sample
def __call__(self, sample, context=None):
""" Process a sample.
Args:
sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx}
context (dict): info about this sample processing
Returns:
result (dict): a processed sample
"""
if isinstance(sample, Sequence): # for batch_size
for i in range(len(sample)):
sample[i] = self.apply(sample[i], context)
else:
# image.shape changed
sample = self.apply(sample, context)
return sample
def __str__(self):
return str(self._id)
@register_keypointop
class CropAndFlipImages(object):
"""Crop all images"""
def __init__(self, crop_range, flip_pairs=None):
super(CropAndFlipImages, self).__init__()
self.crop_range = crop_range
self.flip_pairs = flip_pairs
def __call__(self, records): # tuple
images = records["image"]
images = images[:, :, ::-1, :]
images = images[:, :, self.crop_range[0]:self.crop_range[1]]
records["image"] = images
if "kps2d" in records.keys():
kps2d = records["kps2d"]
width, height = images.shape[2], images.shape[1]
kps2d = np.array(kps2d)
kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0]
for pair in self.flip_pairs:
kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
records["kps2d"] = kps2d
return records
@register_op
class PermuteImages(BaseOperator):
def __init__(self):
"""
        Permute the images from (batch_size, H, W, C) to (batch_size, C, H, W), e.g. (6, 3, 1080, 1920)
"""
super(PermuteImages, self).__init__()
def apply(self, sample, context=None):
images = sample["image"]
images = images.transpose((0, 3, 1, 2))
sample["image"] = images
return sample
@register_keypointop
class RandomFlipHalfBody3DTransformImages(object):
"""apply data augment to images and coords
to achieve the flip, scale, rotate and half body transform effect for training image
Args:
trainsize (list):[w, h], Image target size
upper_body_ids (list): The upper body joint ids
flip_pairs (list): The left-right joints exchange order list
pixel_std (int): The pixel std of the scale
scale (float): The scale factor to transform the image
rot (int): The rotate factor to transform the image
num_joints_half_body (int): The joints threshold of the half body transform
prob_half_body (float): The threshold of the half body transform
flip (bool): Whether to flip the image
Returns:
        records (dict): contains the image and coords after being transformed
"""
def __init__(self,
trainsize,
upper_body_ids,
flip_pairs,
pixel_std,
scale=0.35,
rot=40,
num_joints_half_body=8,
prob_half_body=0.3,
flip=True,
rot_prob=0.6,
do_occlusion=False):
super(RandomFlipHalfBody3DTransformImages, self).__init__()
self.trainsize = trainsize
self.upper_body_ids = upper_body_ids
self.flip_pairs = flip_pairs
self.pixel_std = pixel_std
self.scale = scale
self.rot = rot
self.num_joints_half_body = num_joints_half_body
self.prob_half_body = prob_half_body
self.flip = flip
self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1]
self.rot_prob = rot_prob
self.do_occlusion = do_occlusion
def halfbody_transform(self, joints, joints_vis):
upper_joints = []
lower_joints = []
for joint_id in range(joints.shape[0]):
if joints_vis[joint_id][0] > 0:
if joint_id in self.upper_body_ids:
upper_joints.append(joints[joint_id])
else:
lower_joints.append(joints[joint_id])
if np.random.randn() < 0.5 and len(upper_joints) > 2:
selected_joints = upper_joints
else:
selected_joints = lower_joints if len(
lower_joints) > 2 else upper_joints
if len(selected_joints) < 2:
return None, None
selected_joints = np.array(selected_joints, dtype=np.float32)
center = selected_joints.mean(axis=0)[:2]
left_top = np.amin(selected_joints, axis=0)
right_bottom = np.amax(selected_joints, axis=0)
w = right_bottom[0] - left_top[0]
h = right_bottom[1] - left_top[1]
if w > self.aspect_ratio * h:
h = w * 1.0 / self.aspect_ratio
elif w < self.aspect_ratio * h:
w = h * self.aspect_ratio
scale = np.array(
[w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std],
dtype=np.float32)
scale = scale * 1.5
return center, scale
def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None):
# joints: (6, 24, 3),(num_frames, num_joints, 3)
joints[:, :, 0] = width - joints[:, :, 0] - 1 # x
if kps2d is not None:
kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1
for pair in matched_parts:
joints[:, pair[0], :], joints[:,pair[1], :] = \
joints[:,pair[1], :], joints[:,pair[0], :].copy()
joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \
joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy()
if kps2d is not None:
kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \
kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy()
# move to zero
joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray
return joints, joints_vis, kps2d
def __call__(self, records):
        images = records[
            'image']  # images.shape: (num_frames, height, width, 3)
joints = records['kps3d']
joints_vis = records['kps3d_vis']
kps2d = None
if 'kps2d' in records.keys():
kps2d = records['kps2d']
if self.flip and np.random.random() <= 0.5:
            images = images[:, :, ::-1, :]  # flip the images horizontally, e.g. (6, 1080, 810, 3)
joints, joints_vis, kps2d = self.flip_joints(
joints, joints_vis, images.shape[2], self.flip_pairs,
                kps2d)  # mirror the keypoints left-right
occlusion = False
        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
height = images[0].shape[0]
width = images[0].shape[1]
occlusion = True
while True:
area_min = 0.0
area_max = 0.2
synth_area = (random.random() *
(area_max - area_min) + area_min) * width * height
ratio_min = 0.3
ratio_max = 1 / 0.3
synth_ratio = (random.random() *
(ratio_max - ratio_min) + ratio_min)
synth_h = math.sqrt(synth_area * synth_ratio)
synth_w = math.sqrt(synth_area / synth_ratio)
synth_xmin = random.random() * (width - synth_w - 1)
synth_ymin = random.random() * (height - synth_h - 1)
if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
xmin = int(synth_xmin)
ymin = int(synth_ymin)
w = int(synth_w)
h = int(synth_h)
mask = np.random.rand(h, w, 3) * 255
images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
None, :, :, :]
break
records['image'] = images
records['kps3d'] = joints
records['kps3d_vis'] = joints_vis
if kps2d is not None:
records['kps2d'] = kps2d
return records
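# --- Worked sketch of the occlusion-patch geometry above (toy numbers) ----------
# A target area is drawn as a fraction of the image, an aspect ratio is drawn,
# and the patch height/width are recovered so that h * w == area and
# h / w == ratio.
import math
_width, _height = 810, 1080
_area_frac, _ratio = 0.1, 0.5            # sampled from [0.0, 0.2] and [0.3, 1/0.3]
_synth_area = _area_frac * _width * _height
_synth_h = math.sqrt(_synth_area * _ratio)
_synth_w = math.sqrt(_synth_area / _ratio)
assert abs(_synth_h * _synth_w - _synth_area) < 1e-6
assert abs(_synth_h / _synth_w - _ratio) < 1e-6
print(round(_synth_h), round(_synth_w))  # patch size in pixels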

View File

@@ -0,0 +1,627 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Integral
import cv2
import copy
import numpy as np
import random
import math
from .operators import BaseOperator, register_op
from .batch_operators import Gt2TTFTarget
from ppdet.modeling.bbox_utils import bbox_iou_np_expand
from ppdet.utils.logger import setup_logger
from .op_helper import gaussian_radius
logger = setup_logger(__name__)
__all__ = [
'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres',
'Gt2JDETargetMax', 'Gt2FairMOTTarget'
]
@register_op
class RGBReverse(BaseOperator):
"""RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine
"""
def __init__(self):
super(RGBReverse, self).__init__()
def apply(self, sample, context=None):
im = sample['image']
sample['image'] = np.ascontiguousarray(im[:, :, ::-1])
return sample
@register_op
class LetterBoxResize(BaseOperator):
def __init__(self, target_size):
"""
Resize image to target size, convert normalized xywh to pixel xyxy
format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]).
Args:
target_size (int|list): image target size.
"""
super(LetterBoxResize, self).__init__()
if not isinstance(target_size, (Integral, Sequence)):
raise TypeError(
"Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)):
# letterbox: resize a rectangular image to a padded rectangular
shape = img.shape[:2] # [height, width]
ratio_h = float(height) / shape[0]
ratio_w = float(width) / shape[1]
ratio = min(ratio_h, ratio_w)
new_shape = (round(shape[1] * ratio),
round(shape[0] * ratio)) # [width, height]
padw = (width - new_shape[0]) / 2
padh = (height - new_shape[1]) / 2
top, bottom = round(padh - 0.1), round(padh + 0.1)
left, right = round(padw - 0.1), round(padw + 0.1)
img = cv2.resize(
img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
img = cv2.copyMakeBorder(
img, top, bottom, left, right, cv2.BORDER_CONSTANT,
value=color) # padded rectangular
return img, ratio, padw, padh
def apply_bbox(self, bbox0, h, w, ratio, padw, padh):
bboxes = bbox0.copy()
bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw
bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh
bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw
bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh
return bboxes
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
h, w = sample['im_shape']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
from PIL import UnidentifiedImageError
raise UnidentifiedImageError(
'{}: image is not 3-dimensional.'.format(self))
# apply image
height, width = self.target_size
img, ratio, padw, padh = self.apply_image(
im, height=height, width=width)
sample['image'] = img
new_shape = (round(h * ratio), round(w * ratio))
sample['im_shape'] = np.asarray(new_shape, dtype=np.float32)
sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio,
padw, padh)
return sample
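# --- Worked sketch of the letterbox math in apply_image above (toy numbers) -----
# The image is scaled by the smaller of the two ratios, and the remaining space
# is split evenly into left/right and top/bottom padding.
_height, _width = 608, 1088                 # target (height, width)
_shape = (1080, 1920)                       # original (height, width)
_ratio = min(_height / _shape[0], _width / _shape[1])                  # ~0.563
_new_w, _new_h = round(_shape[1] * _ratio), round(_shape[0] * _ratio)  # ~1081 x 608
_padw, _padh = (_width - _new_w) / 2, (_height - _new_h) / 2           # ~3.5, 0.0
print(_ratio, (_new_w, _new_h), (_padw, _padh))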
@register_op
class MOTRandomAffine(BaseOperator):
"""
Affine transform to image and coords to achieve the rotate, scale and
shift effect for training image.
Args:
degrees (list[2]): the rotate range to apply, transform range is [min, max]
translate (list[2]): the translate range to apply, transform range is [min, max]
scale (list[2]): the scale range to apply, transform range is [min, max]
shear (list[2]): the shear range to apply, transform range is [min, max]
        borderValue (list[3]): value used in case of a constant border when applying
            the perspective transformation
reject_outside (bool): reject warped bounding bboxes outside of image
Returns:
        records (dict): contains the image and coords after being transformed
"""
def __init__(self,
degrees=(-5, 5),
translate=(0.10, 0.10),
scale=(0.50, 1.20),
shear=(-2, 2),
borderValue=(127.5, 127.5, 127.5),
reject_outside=True):
super(MOTRandomAffine, self).__init__()
self.degrees = degrees
self.translate = translate
self.scale = scale
self.shear = shear
self.borderValue = borderValue
self.reject_outside = reject_outside
def apply(self, sample, context=None):
# https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4
border = 0 # width of added border (optional)
img = sample['image']
height, width = img.shape[0], img.shape[1]
# Rotation and Scale
R = np.eye(3)
a = random.random() * (self.degrees[1] - self.degrees[0]
) + self.degrees[0]
s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0]
R[:2] = cv2.getRotationMatrix2D(
angle=a, center=(width / 2, height / 2), scale=s)
# Translation
T = np.eye(3)
T[0, 2] = (
random.random() * 2 - 1
) * self.translate[0] * height + border # x translation (pixels)
T[1, 2] = (
random.random() * 2 - 1
) * self.translate[1] * width + border # y translation (pixels)
# Shear
S = np.eye(3)
S[0, 1] = math.tan((random.random() *
(self.shear[1] - self.shear[0]) + self.shear[0]) *
math.pi / 180) # x shear (deg)
S[1, 0] = math.tan((random.random() *
(self.shear[1] - self.shear[0]) + self.shear[0]) *
math.pi / 180) # y shear (deg)
        M = S @ T @ R  # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
imw = cv2.warpPerspective(
img,
M,
dsize=(width, height),
flags=cv2.INTER_LINEAR,
borderValue=self.borderValue) # BGR order borderValue
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
targets = sample['gt_bbox']
n = targets.shape[0]
points = targets.copy()
area0 = (points[:, 2] - points[:, 0]) * (
points[:, 3] - points[:, 1])
# warp points
xy = np.ones((n * 4, 3))
xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape(
n * 4, 2) # x1y1, x2y2, x1y2, x2y1
xy = (xy @M.T)[:, :2].reshape(n, 8)
# create new boxes
x = xy[:, [0, 2, 4, 6]]
y = xy[:, [1, 3, 5, 7]]
xy = np.concatenate(
(x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T
# apply angle-based reduction
radians = a * math.pi / 180
reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5
x = (xy[:, 2] + xy[:, 0]) / 2
y = (xy[:, 3] + xy[:, 1]) / 2
w = (xy[:, 2] - xy[:, 0]) * reduction
h = (xy[:, 3] - xy[:, 1]) * reduction
xy = np.concatenate(
(x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T
# reject warped points outside of image
if self.reject_outside:
np.clip(xy[:, 0], 0, width, out=xy[:, 0])
np.clip(xy[:, 2], 0, width, out=xy[:, 2])
np.clip(xy[:, 1], 0, height, out=xy[:, 1])
np.clip(xy[:, 3], 0, height, out=xy[:, 3])
w = xy[:, 2] - xy[:, 0]
h = xy[:, 3] - xy[:, 1]
area = w * h
ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
if sum(i) > 0:
sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype)
sample['gt_class'] = sample['gt_class'][i]
if 'difficult' in sample:
sample['difficult'] = sample['difficult'][i]
if 'gt_ide' in sample:
sample['gt_ide'] = sample['gt_ide'][i]
if 'is_crowd' in sample:
sample['is_crowd'] = sample['is_crowd'][i]
sample['image'] = imw
return sample
else:
return sample
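# --- Minimal sketch of the corner warping used by MOTRandomAffine (toy values) --
# Each box corner is lifted to homogeneous coordinates, multiplied by the
# combined 3x3 matrix M = Shear @ Translation @ Rotation, and a new axis-aligned
# box is taken from the min/max of the warped corners.
import numpy as np
_M = np.eye(3)
_M[0, 2], _M[1, 2] = 10.0, -5.0                  # pure translation for the demo
_box = np.array([100.0, 50.0, 200.0, 150.0])     # x1, y1, x2, y2
_corners = np.ones((4, 3))
_corners[:, :2] = _box[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2)  # x1y1, x2y2, x1y2, x2y1
_warped = (_corners @ _M.T)[:, :2]
_new_box = np.concatenate([_warped.min(0), _warped.max(0)])     # [110., 45., 210., 145.]
print(_new_box)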
@register_op
class Gt2JDETargetThres(BaseOperator):
__shared__ = ['num_classes']
"""
    Generate JDE targets from ground truth data when training
Args:
anchors (list): anchors of JDE model
anchor_masks (list): anchor_masks of JDE model
downsample_ratios (list): downsample ratios of JDE model
        ide_thresh (float): identity threshold; anchors with IoU above it take the ground truth identity
        fg_thresh (float): foreground threshold; anchors with IoU above it are foreground
        bg_thresh (float): background threshold; anchors with IoU below it are background
num_classes (int): number of classes
"""
def __init__(self,
anchors,
anchor_masks,
downsample_ratios,
ide_thresh=0.5,
fg_thresh=0.5,
bg_thresh=0.4,
num_classes=1):
super(Gt2JDETargetThres, self).__init__()
self.anchors = anchors
self.anchor_masks = anchor_masks
self.downsample_ratios = downsample_ratios
self.ide_thresh = ide_thresh
self.fg_thresh = fg_thresh
self.bg_thresh = bg_thresh
self.num_classes = num_classes
def generate_anchor(self, nGh, nGw, anchor_hw):
nA = len(anchor_hw)
yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw))
mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw]
mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw]
anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None]
anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2)
anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1)
anchor_mesh = np.concatenate(
[mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw]
return anchor_mesh
def encode_delta(self, gt_box_list, fg_anchor_list):
px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \
fg_anchor_list[:, 2], fg_anchor_list[:,3]
gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \
gt_box_list[:, 2], gt_box_list[:, 3]
dx = (gx - px) / pw
dy = (gy - py) / ph
dw = np.log(gw / pw)
dh = np.log(gh / ph)
return np.stack([dx, dy, dw, dh], axis=1)
def pad_box(self, sample, num_max):
assert 'gt_bbox' in sample
bbox = sample['gt_bbox']
gt_num = len(bbox)
pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
if gt_num > 0:
pad_bbox[:gt_num, :] = bbox[:gt_num, :]
sample['gt_bbox'] = pad_bbox
if 'gt_score' in sample:
pad_score = np.zeros((num_max, ), dtype=np.float32)
if gt_num > 0:
pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
sample['gt_score'] = pad_score
if 'difficult' in sample:
pad_diff = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
sample['difficult'] = pad_diff
if 'is_crowd' in sample:
pad_crowd = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
sample['is_crowd'] = pad_crowd
if 'gt_ide' in sample:
pad_ide = np.zeros((num_max, ), dtype=np.int32)
if gt_num > 0:
pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
sample['gt_ide'] = pad_ide
return sample
def __call__(self, samples, context=None):
assert len(self.anchor_masks) == len(self.downsample_ratios), \
"anchor_masks', and 'downsample_ratios' should have same length."
h, w = samples[0]['image'].shape[1:3]
num_max = 0
for sample in samples:
num_max = max(num_max, len(sample['gt_bbox']))
for sample in samples:
gt_bbox = sample['gt_bbox']
gt_ide = sample['gt_ide']
for i, (anchor_hw, downsample_ratio
) in enumerate(zip(self.anchors, self.downsample_ratios)):
anchor_hw = np.array(
anchor_hw, dtype=np.float32) / downsample_ratio
nA = len(anchor_hw)
nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1)
gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1)
tboxes = np.concatenate([gxy, gwh], axis=1)
anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw)
anchor_list = np.transpose(anchor_mesh,
(0, 2, 3, 1)).reshape(-1, 4)
iou_pdist = bbox_iou_np_expand(
anchor_list, tboxes, x1y1x2y2=False)
iou_max = np.max(iou_pdist, axis=1)
max_gt_index = np.argmax(iou_pdist, axis=1)
iou_map = iou_max.reshape(nA, nGh, nGw)
gt_index_map = max_gt_index.reshape(nA, nGh, nGw)
id_index = iou_map > self.ide_thresh
fg_index = iou_map > self.fg_thresh
bg_index = iou_map < self.bg_thresh
ign_index = (iou_map < self.fg_thresh) * (
iou_map > self.bg_thresh)
tconf[fg_index] = 1
tconf[bg_index] = 0
tconf[ign_index] = -1
gt_index = gt_index_map[fg_index]
gt_box_list = tboxes[gt_index]
gt_id_list = gt_ide[gt_index_map[id_index]]
if np.sum(fg_index) > 0:
tid[id_index] = gt_id_list
fg_anchor_list = anchor_list.reshape(nA, nGh, nGw,
4)[fg_index]
delta_target = self.encode_delta(gt_box_list,
fg_anchor_list)
tbox[fg_index] = delta_target
sample['tbox{}'.format(i)] = tbox
sample['tconf{}'.format(i)] = tconf
sample['tide{}'.format(i)] = tid
sample.pop('gt_class')
sample = self.pad_box(sample, num_max)
return samples
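# --- Sketch of the box-delta encoding in encode_delta above (toy values) --------
# The regression target is the gt center offset scaled by the anchor size plus
# the log of the size ratio; decoding inverts it.
import numpy as np
_anchor = np.array([[10.0, 10.0, 4.0, 4.0]])   # cx, cy, w, h on the feature map
_gt = np.array([[12.0, 9.0, 8.0, 2.0]])
_dx = (_gt[:, 0] - _anchor[:, 0]) / _anchor[:, 2]
_dy = (_gt[:, 1] - _anchor[:, 1]) / _anchor[:, 3]
_dw = np.log(_gt[:, 2] / _anchor[:, 2])
_dh = np.log(_gt[:, 3] / _anchor[:, 3])
_delta = np.stack([_dx, _dy, _dw, _dh], axis=1)  # [[0.5, -0.25, log 2, -log 2]]
# decoding recovers the gt box from the anchor and the delta
_cx = _anchor[:, 0] + _delta[:, 0] * _anchor[:, 2]   # 12.0
_cy = _anchor[:, 1] + _delta[:, 1] * _anchor[:, 3]   # 9.0
_wh = _anchor[:, 2:4] * np.exp(_delta[:, 2:4])       # [8., 2.]
print(_delta, _cx, _cy, _wh)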
@register_op
class Gt2JDETargetMax(BaseOperator):
__shared__ = ['num_classes']
"""
    Generate JDE targets from ground truth data when evaluating
Args:
anchors (list): anchors of JDE model
anchor_masks (list): anchor_masks of JDE model
downsample_ratios (list): downsample ratios of JDE model
max_iou_thresh (float): iou thresh for high quality anchor
num_classes (int): number of classes
"""
def __init__(self,
anchors,
anchor_masks,
downsample_ratios,
max_iou_thresh=0.60,
num_classes=1):
super(Gt2JDETargetMax, self).__init__()
self.anchors = anchors
self.anchor_masks = anchor_masks
self.downsample_ratios = downsample_ratios
self.max_iou_thresh = max_iou_thresh
self.num_classes = num_classes
def __call__(self, samples, context=None):
assert len(self.anchor_masks) == len(self.downsample_ratios), \
"anchor_masks', and 'downsample_ratios' should have same length."
h, w = samples[0]['image'].shape[1:3]
for sample in samples:
gt_bbox = sample['gt_bbox']
gt_ide = sample['gt_ide']
for i, (anchor_hw, downsample_ratio
) in enumerate(zip(self.anchors, self.downsample_ratios)):
anchor_hw = np.array(
anchor_hw, dtype=np.float32) / downsample_ratio
nA = len(anchor_hw)
nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)
gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)
# iou of targets-anchors (using wh only)
box1 = gwh
box2 = anchor_hw[:, None, :]
inter_area = np.minimum(box1, box2).prod(2)
iou = inter_area / (
box1.prod(1) + box2.prod(2) - inter_area + 1e-16)
# Select best iou_pred and anchor
iou_best = iou.max(0) # best anchor [0-2] for each target
a = np.argmax(iou, axis=0)
# Select best unique target-anchor combinations
iou_order = np.argsort(-iou_best) # best to worst
# Unique anchor selection
u = np.stack((gi, gj, a), 0)[:, iou_order]
_, first_unique = np.unique(u, axis=1, return_index=True)
mask = iou_order[first_unique]
# best anchor must share significant commonality (iou) with target
# TODO: examine arbitrary threshold
idx = mask[iou_best[mask] > self.max_iou_thresh]
if len(idx) > 0:
a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]
t_box = gt_bbox[idx]
t_id = gt_ide[idx]
if len(t_box.shape) == 1:
t_box = t_box.reshape(1, 4)
gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()
gxy[:, 0] = gxy[:, 0] * nGw
gxy[:, 1] = gxy[:, 1] * nGh
gwh[:, 0] = gwh[:, 0] * nGw
gwh[:, 1] = gwh[:, 1] * nGh
# XY coordinates
tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)
# Width and height in yolo method
tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh /
anchor_hw[a_i])
tconf[a_i, gj_i, gi_i] = 1
tid[a_i, gj_i, gi_i] = t_id
sample['tbox{}'.format(i)] = tbox
sample['tconf{}'.format(i)] = tconf
sample['tide{}'.format(i)] = tid
class Gt2FairMOTTarget(Gt2TTFTarget):
__shared__ = ['num_classes']
"""
    Generate FairMOT targets from ground truth data.
    The differences between Gt2FairMOTTarget and Gt2TTFTarget are:
        1. the gaussian kernel radius used to generate the heatmap.
        2. the targets needed during training.
Args:
num_classes(int): the number of classes.
down_ratio(int): the down ratio from images to heatmap, 4 by default.
        max_objs(int): the maximum number of ground truth objects in an image, 500 by default.
"""
def __init__(self, num_classes=1, down_ratio=4, max_objs=500):
super(Gt2TTFTarget, self).__init__()
self.down_ratio = down_ratio
self.num_classes = num_classes
self.max_objs = max_objs
def __call__(self, samples, context=None):
for b_id, sample in enumerate(samples):
output_h = sample['image'].shape[1] // self.down_ratio
output_w = sample['image'].shape[2] // self.down_ratio
heatmap = np.zeros(
(self.num_classes, output_h, output_w), dtype='float32')
bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)
center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)
index = np.zeros((self.max_objs, ), dtype=np.int64)
index_mask = np.zeros((self.max_objs, ), dtype=np.int32)
reid = np.zeros((self.max_objs, ), dtype=np.int64)
bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)
if self.num_classes > 1:
# each category corresponds to a set of track ids
cls_tr_ids = np.zeros(
(self.num_classes, output_h, output_w), dtype=np.int64)
cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)
gt_bbox = sample['gt_bbox']
gt_class = sample['gt_class']
gt_ide = sample['gt_ide']
for k in range(len(gt_bbox)):
cls_id = gt_class[k][0]
bbox = gt_bbox[k]
ide = gt_ide[k][0]
bbox[[0, 2]] = bbox[[0, 2]] * output_w
bbox[[1, 3]] = bbox[[1, 3]] * output_h
bbox_amodal = copy.deepcopy(bbox)
bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.
bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.
bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]
bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]
bbox[0] = np.clip(bbox[0], 0, output_w - 1)
bbox[1] = np.clip(bbox[1], 0, output_h - 1)
h = bbox[3]
w = bbox[2]
bbox_xy = copy.deepcopy(bbox)
bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2
bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2
bbox_xy[2] = bbox_xy[0] + bbox_xy[2]
bbox_xy[3] = bbox_xy[1] + bbox_xy[3]
if h > 0 and w > 0:
radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
radius = max(0, int(radius))
ct = np.array([bbox[0], bbox[1]], dtype=np.float32)
ct_int = ct.astype(np.int32)
self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius,
radius)
bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \
bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]
index[k] = ct_int[1] * output_w + ct_int[0]
center_offset[k] = ct - ct_int
index_mask[k] = 1
reid[k] = ide
bbox_xys[k] = bbox_xy
if self.num_classes > 1:
cls_id_map[ct_int[1], ct_int[0]] = cls_id
cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1
# track id start from 0
sample['heatmap'] = heatmap
sample['index'] = index
sample['offset'] = center_offset
sample['size'] = bbox_size
sample['index_mask'] = index_mask
sample['reid'] = reid
if self.num_classes > 1:
sample['cls_id_map'] = cls_id_map
sample['cls_tr_ids'] = cls_tr_ids
sample['bbox_xys'] = bbox_xys
sample.pop('is_crowd', None)
sample.pop('difficult', None)
sample.pop('gt_class', None)
sample.pop('gt_bbox', None)
sample.pop('gt_score', None)
sample.pop('gt_ide', None)
return samples
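# --- Sketch of the flattened center index computed above (toy numbers) ----------
# The heatmap has shape (output_h, output_w) and the per-object index is
# row-major, so an integer center (x, y) maps to y * output_w + x.
import numpy as np
_output_h, _output_w = 152, 272
_ct_int = np.array([37, 90])                  # (x, y) on the heatmap
_index = _ct_int[1] * _output_w + _ct_int[0]  # 90 * 272 + 37 = 24517
_flat = np.arange(_output_h * _output_w).reshape(_output_h, _output_w)
assert _flat[_ct_int[1], _ct_int[0]] == _index
print(_index)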

View File

@@ -0,0 +1,494 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this file contains helper methods for BBOX processing
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import random
import math
import cv2
def meet_emit_constraint(src_bbox, sample_bbox):
center_x = (src_bbox[2] + src_bbox[0]) / 2
center_y = (src_bbox[3] + src_bbox[1]) / 2
if center_x >= sample_bbox[0] and \
center_x <= sample_bbox[2] and \
center_y >= sample_bbox[1] and \
center_y <= sample_bbox[3]:
return True
return False
def clip_bbox(src_bbox):
src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0)
src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0)
src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0)
src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0)
return src_bbox
def bbox_area(src_bbox):
if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]:
return 0.
else:
width = src_bbox[2] - src_bbox[0]
height = src_bbox[3] - src_bbox[1]
return width * height
def is_overlap(object_bbox, sample_bbox):
if object_bbox[0] >= sample_bbox[2] or \
object_bbox[2] <= sample_bbox[0] or \
object_bbox[1] >= sample_bbox[3] or \
object_bbox[3] <= sample_bbox[1]:
return False
else:
return True
def filter_and_process(sample_bbox, bboxes, labels, scores=None,
keypoints=None):
new_bboxes = []
new_labels = []
new_scores = []
new_keypoints = []
new_kp_ignore = []
for i in range(len(bboxes)):
new_bbox = [0, 0, 0, 0]
obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]]
if not meet_emit_constraint(obj_bbox, sample_bbox):
continue
if not is_overlap(obj_bbox, sample_bbox):
continue
sample_width = sample_bbox[2] - sample_bbox[0]
sample_height = sample_bbox[3] - sample_bbox[1]
new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width
new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height
new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width
new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height
new_bbox = clip_bbox(new_bbox)
if bbox_area(new_bbox) > 0:
new_bboxes.append(new_bbox)
new_labels.append([labels[i][0]])
if scores is not None:
new_scores.append([scores[i][0]])
if keypoints is not None:
sample_keypoint = keypoints[0][i]
for j in range(len(sample_keypoint)):
kp_len = sample_height if j % 2 else sample_width
sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0]
sample_keypoint[j] = (
sample_keypoint[j] - sample_coord) / kp_len
sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0)
new_keypoints.append(sample_keypoint)
new_kp_ignore.append(keypoints[1][i])
bboxes = np.array(new_bboxes)
labels = np.array(new_labels)
scores = np.array(new_scores)
if keypoints is not None:
keypoints = np.array(new_keypoints)
new_kp_ignore = np.array(new_kp_ignore)
return bboxes, labels, scores, (keypoints, new_kp_ignore)
return bboxes, labels, scores
def bbox_area_sampling(bboxes, labels, scores, target_size, min_size):
new_bboxes = []
new_labels = []
new_scores = []
for i, bbox in enumerate(bboxes):
w = float((bbox[2] - bbox[0]) * target_size)
h = float((bbox[3] - bbox[1]) * target_size)
if w * h < float(min_size * min_size):
continue
else:
new_bboxes.append(bbox)
new_labels.append(labels[i])
if scores is not None and scores.size != 0:
new_scores.append(scores[i])
bboxes = np.array(new_bboxes)
labels = np.array(new_labels)
scores = np.array(new_scores)
return bboxes, labels, scores
def generate_sample_bbox(sampler):
scale = np.random.uniform(sampler[2], sampler[3])
aspect_ratio = np.random.uniform(sampler[4], sampler[5])
aspect_ratio = max(aspect_ratio, (scale**2.0))
aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
bbox_width = scale * (aspect_ratio**0.5)
bbox_height = scale / (aspect_ratio**0.5)
xmin_bound = 1 - bbox_width
ymin_bound = 1 - bbox_height
xmin = np.random.uniform(0, xmin_bound)
ymin = np.random.uniform(0, ymin_bound)
xmax = xmin + bbox_width
ymax = ymin + bbox_height
sampled_bbox = [xmin, ymin, xmax, ymax]
return sampled_bbox
def generate_sample_bbox_square(sampler, image_width, image_height):
scale = np.random.uniform(sampler[2], sampler[3])
aspect_ratio = np.random.uniform(sampler[4], sampler[5])
aspect_ratio = max(aspect_ratio, (scale**2.0))
aspect_ratio = min(aspect_ratio, 1 / (scale**2.0))
bbox_width = scale * (aspect_ratio**0.5)
bbox_height = scale / (aspect_ratio**0.5)
if image_height < image_width:
bbox_width = bbox_height * image_height / image_width
else:
bbox_height = bbox_width * image_width / image_height
xmin_bound = 1 - bbox_width
ymin_bound = 1 - bbox_height
xmin = np.random.uniform(0, xmin_bound)
ymin = np.random.uniform(0, ymin_bound)
xmax = xmin + bbox_width
ymax = ymin + bbox_height
sampled_bbox = [xmin, ymin, xmax, ymax]
return sampled_bbox
def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array,
resize_width):
num_gt = len(bbox_labels)
# np.random.randint range: [low, high)
rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0
if num_gt != 0:
norm_xmin = bbox_labels[rand_idx][0]
norm_ymin = bbox_labels[rand_idx][1]
norm_xmax = bbox_labels[rand_idx][2]
norm_ymax = bbox_labels[rand_idx][3]
xmin = norm_xmin * image_width
ymin = norm_ymin * image_height
wid = image_width * (norm_xmax - norm_xmin)
hei = image_height * (norm_ymax - norm_ymin)
range_size = 0
area = wid * hei
for scale_ind in range(0, len(scale_array) - 1):
if area > scale_array[scale_ind] ** 2 and area < \
scale_array[scale_ind + 1] ** 2:
range_size = scale_ind + 1
break
if area > scale_array[len(scale_array) - 2]**2:
range_size = len(scale_array) - 2
scale_choose = 0.0
if range_size == 0:
rand_idx_size = 0
else:
# np.random.randint range: [low, high)
rng_rand_size = np.random.randint(0, range_size + 1)
rand_idx_size = rng_rand_size % (range_size + 1)
if rand_idx_size == range_size:
min_resize_val = scale_array[rand_idx_size] / 2.0
max_resize_val = min(2.0 * scale_array[rand_idx_size],
2 * math.sqrt(wid * hei))
scale_choose = random.uniform(min_resize_val, max_resize_val)
else:
min_resize_val = scale_array[rand_idx_size] / 2.0
max_resize_val = 2.0 * scale_array[rand_idx_size]
scale_choose = random.uniform(min_resize_val, max_resize_val)
sample_bbox_size = wid * resize_width / scale_choose
w_off_orig = 0.0
h_off_orig = 0.0
if sample_bbox_size < max(image_height, image_width):
if wid <= sample_bbox_size:
w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size,
xmin)
else:
w_off_orig = np.random.uniform(xmin,
xmin + wid - sample_bbox_size)
if hei <= sample_bbox_size:
h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size,
ymin)
else:
h_off_orig = np.random.uniform(ymin,
ymin + hei - sample_bbox_size)
else:
w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0)
h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0)
w_off_orig = math.floor(w_off_orig)
h_off_orig = math.floor(h_off_orig)
# Figure out top left coordinates.
w_off = float(w_off_orig / image_width)
h_off = float(h_off_orig / image_height)
sampled_bbox = [
w_off, h_off, w_off + float(sample_bbox_size / image_width),
h_off + float(sample_bbox_size / image_height)
]
return sampled_bbox
else:
return 0
def jaccard_overlap(sample_bbox, object_bbox):
if sample_bbox[0] >= object_bbox[2] or \
sample_bbox[2] <= object_bbox[0] or \
sample_bbox[1] >= object_bbox[3] or \
sample_bbox[3] <= object_bbox[1]:
return 0
intersect_xmin = max(sample_bbox[0], object_bbox[0])
intersect_ymin = max(sample_bbox[1], object_bbox[1])
intersect_xmax = min(sample_bbox[2], object_bbox[2])
intersect_ymax = min(sample_bbox[3], object_bbox[3])
intersect_size = (intersect_xmax - intersect_xmin) * (
intersect_ymax - intersect_ymin)
sample_bbox_size = bbox_area(sample_bbox)
object_bbox_size = bbox_area(object_bbox)
overlap = intersect_size / (
sample_bbox_size + object_bbox_size - intersect_size)
return overlap
def intersect_bbox(bbox1, bbox2):
if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \
bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]:
intersection_box = [0.0, 0.0, 0.0, 0.0]
else:
intersection_box = [
max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]),
min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3])
]
return intersection_box
def bbox_coverage(bbox1, bbox2):
inter_box = intersect_bbox(bbox1, bbox2)
intersect_size = bbox_area(inter_box)
if intersect_size > 0:
bbox1_size = bbox_area(bbox1)
return intersect_size / bbox1_size
else:
return 0.
def satisfy_sample_constraint(sampler,
sample_bbox,
gt_bboxes,
satisfy_all=False):
if sampler[6] == 0 and sampler[7] == 0:
return True
satisfied = []
for i in range(len(gt_bboxes)):
object_bbox = [
gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
]
overlap = jaccard_overlap(sample_bbox, object_bbox)
if sampler[6] != 0 and \
overlap < sampler[6]:
satisfied.append(False)
continue
if sampler[7] != 0 and \
overlap > sampler[7]:
satisfied.append(False)
continue
satisfied.append(True)
if not satisfy_all:
return True
if satisfy_all:
return np.all(satisfied)
else:
return False
def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes):
if sampler[6] == 0 and sampler[7] == 0:
has_jaccard_overlap = False
else:
has_jaccard_overlap = True
if sampler[8] == 0 and sampler[9] == 0:
has_object_coverage = False
else:
has_object_coverage = True
if not has_jaccard_overlap and not has_object_coverage:
return True
found = False
for i in range(len(gt_bboxes)):
object_bbox = [
gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3]
]
if has_jaccard_overlap:
overlap = jaccard_overlap(sample_bbox, object_bbox)
if sampler[6] != 0 and \
overlap < sampler[6]:
continue
if sampler[7] != 0 and \
overlap > sampler[7]:
continue
found = True
if has_object_coverage:
object_coverage = bbox_coverage(object_bbox, sample_bbox)
if sampler[8] != 0 and \
object_coverage < sampler[8]:
continue
if sampler[9] != 0 and \
object_coverage > sampler[9]:
continue
found = True
if found:
return True
return found
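# --- Illustrative sketch of the sampler layout consumed above (toy values) ------
# Indices 2-3 are the scale range, 4-5 the aspect-ratio range, 6-7 the min/max
# jaccard-overlap constraint and 8-9 the min/max coverage constraint; the first
# two entries are not read by these helpers.
import numpy as np
_sampler = [1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0, 0.0, 0.0]
_crop = generate_sample_bbox(_sampler)                 # normalized [xmin, ymin, xmax, ymax]
_gt = np.array([[0.4, 0.4, 0.6, 0.6]])
_ok = satisfy_sample_constraint(_sampler, _crop, _gt)  # requires IoU >= 0.1 with a gt
print(_crop, _ok)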
def crop_image_sampling(img, sample_bbox, image_width, image_height,
target_size):
# no clipping here
xmin = int(sample_bbox[0] * image_width)
xmax = int(sample_bbox[2] * image_width)
ymin = int(sample_bbox[1] * image_height)
ymax = int(sample_bbox[3] * image_height)
w_off = xmin
h_off = ymin
width = xmax - xmin
height = ymax - ymin
cross_xmin = max(0.0, float(w_off))
cross_ymin = max(0.0, float(h_off))
cross_xmax = min(float(w_off + width - 1.0), float(image_width))
cross_ymax = min(float(h_off + height - 1.0), float(image_height))
cross_width = cross_xmax - cross_xmin
cross_height = cross_ymax - cross_ymin
roi_xmin = 0 if w_off >= 0 else abs(w_off)
roi_ymin = 0 if h_off >= 0 else abs(h_off)
roi_width = cross_width
roi_height = cross_height
roi_y1 = int(roi_ymin)
roi_y2 = int(roi_ymin + roi_height)
roi_x1 = int(roi_xmin)
roi_x2 = int(roi_xmin + roi_width)
cross_y1 = int(cross_ymin)
cross_y2 = int(cross_ymin + cross_height)
cross_x1 = int(cross_xmin)
cross_x2 = int(cross_xmin + cross_width)
sample_img = np.zeros((height, width, 3))
sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \
img[cross_y1: cross_y2, cross_x1: cross_x2]
sample_img = cv2.resize(
sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA)
return sample_img
def is_poly(segm):
assert isinstance(segm, (list, dict)), \
"Invalid segm type: {}".format(type(segm))
return isinstance(segm, list)
def gaussian_radius(bbox_size, min_overlap):
height, width = bbox_size
a1 = 1
b1 = (height + width)
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = np.sqrt(b1**2 - 4 * a1 * c1)
radius1 = (b1 + sq1) / (2 * a1)
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = np.sqrt(b2**2 - 4 * a2 * c2)
radius2 = (b2 + sq2) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = np.sqrt(b3**2 - 4 * a3 * c3)
radius3 = (b3 + sq3) / 2
return min(radius1, radius2, radius3)
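# --- Worked sketch of gaussian_radius above (toy box size) ----------------------
# Three quadratics are solved (gt corner shifted, predicted corner shifted, both
# shifted) and the smallest radius is kept; the target ops clamp and round it
# before drawing the gaussian peak.
_radius = gaussian_radius((24, 32), min_overlap=0.7)
_radius = max(0, int(_radius))   # same clamping as used by the target ops
print(_radius)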
def draw_gaussian(heatmap, center, radius, k=1, delte=6):
diameter = 2 * radius + 1
sigma = diameter / delte
gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma)
x, y = center
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
radius + right]
np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
def gaussian2D(shape, sigma_x=1, sigma_y=1):
m, n = [(ss - 1.) / 2. for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y *
sigma_y)))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_umich_gaussian(heatmap, center, radius, k=1):
"""
draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126
"""
diameter = 2 * radius + 1
gaussian = gaussian2D(
(diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:
radius + right]
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
return heatmap
def get_border(border, size):
i = 1
while size - border // i <= border // i:
i *= 2
return border // i
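# Illustrative usage sketch (not part of the original file): get_border shrinks a
# requested crop border until the remaining extent (size - border) is strictly larger
# than the border itself, as used for CenterNet-style random cropping of small images.
def _demo_get_border():
    return get_border(128, 512), get_border(128, 200)  # -> (128, 64)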

File diff suppressed because it is too large

View File

@@ -0,0 +1,480 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
try:
from collections.abc import Sequence
except Exception:
from collections import Sequence
from numbers import Number, Integral
import cv2
import numpy as np
import math
import copy
import os
from PIL import Image, ImageDraw
# os / PIL are needed by VisibleRBox below; ImageError, raised in RResize, is assumed
# to be defined alongside BaseOperator in .operators
from .operators import register_op, BaseOperator, ImageError
from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np
from ppdet.utils.logger import setup_logger
from ppdet.utils.compact import imagedraw_textsize_c
logger = setup_logger(__name__)
@register_op
class RRotate(BaseOperator):
""" Rotate Image, Polygon, Box
Args:
scale (float): rotate scale
angle (float): rotate angle
fill_value (int, tuple): fill color
auto_bound (bool): whether auto bound or not
"""
def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True):
super(RRotate, self).__init__()
self.scale = scale
self.angle = angle
self.fill_value = fill_value
self.auto_bound = auto_bound
def get_rotated_matrix(self, angle, scale, h, w):
center = ((w - 1) * 0.5, (h - 1) * 0.5)
matrix = cv2.getRotationMatrix2D(center, -angle, scale)
# calculate the new size
cos = np.abs(matrix[0, 0])
sin = np.abs(matrix[0, 1])
new_w = h * sin + w * cos
new_h = h * cos + w * sin
# calculate offset
n_w = int(np.round(new_w))
n_h = int(np.round(new_h))
if self.auto_bound:
ratio = min(w / n_w, h / n_h)
matrix = cv2.getRotationMatrix2D(center, -angle, ratio)
else:
matrix[0, 2] += (new_w - w) * 0.5
matrix[1, 2] += (new_h - h) * 0.5
w = n_w
h = n_h
return matrix, h, w
def get_rect_from_pts(self, pts, h, w):
""" get minimum rectangle of points
"""
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
min_x, min_y = np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2],
axis=1)
max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2],
axis=1)
min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h)
max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h)
boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1)
return boxes
def apply_image(self, image, matrix, h, w):
return cv2.warpAffine(
image, matrix, (w, h), borderValue=self.fill_value)
def apply_pts(self, pts, matrix, h, w):
assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct'
# n is number of samples and m is two times the number of points due to (x, y)
_, m = pts.shape
# transpose points
pts_ = pts.reshape(-1, 2).T
# pad 1 to convert the points to homogeneous coordinates
padding = np.ones((1, pts_.shape[1]), pts.dtype)
rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0))
return rotated_pts[:2, :].T.reshape(-1, m)
def apply(self, sample, context=None):
image = sample['image']
h, w = image.shape[:2]
matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w)
sample['image'] = self.apply_image(image, matrix, h, w)
polys = sample['gt_poly']
# TODO: segment or keypoint to be processed
if len(polys) > 0:
pts = self.apply_pts(polys, matrix, h, w)
sample['gt_poly'] = pts
sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w)
return sample
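# Illustrative usage sketch (not part of the original file): rotate a synthetic sample by
# 30 degrees; 'gt_poly' is an (N, 8) array of quadrilaterals as expected by apply_pts.
def _demo_rrotate():
    sample = {
        'image': np.zeros((240, 320, 3), dtype=np.uint8),
        'gt_poly': np.array([[40., 40., 120., 40., 120., 90., 40., 90.]],
                            dtype=np.float32),
    }
    rotator = RRotate(scale=1.0, angle=30., fill_value=0., auto_bound=True)
    out = rotator.apply(sample)
    return out['gt_bbox']  # axis-aligned boxes of the rotated polygons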
@register_op
class RandomRRotate(BaseOperator):
""" Random Rotate Image
Args:
scale (float, tuple, list): rotate scale
scale_mode (str): mode of scale, [range, value, None]
angle (float, tuple, list): rotate angle
angle_mode (str): mode of angle, [range, value, None]
fill_value (float, tuple, list): fill value
rotate_prob (float): probability of rotation
auto_bound (bool): whether auto bound or not
"""
def __init__(self,
scale=1.0,
scale_mode=None,
angle=0.,
angle_mode=None,
fill_value=0.,
rotate_prob=1.0,
auto_bound=True):
super(RandomRRotate, self).__init__()
self.scale = scale
self.scale_mode = scale_mode
self.angle = angle
self.angle_mode = angle_mode
self.fill_value = fill_value
self.rotate_prob = rotate_prob
self.auto_bound = auto_bound
def get_angle(self, angle, angle_mode):
assert not angle_mode or angle_mode in [
'range', 'value'
], 'angle mode should be in [range, value, None]'
if not angle_mode:
return angle
elif angle_mode == 'range':
low, high = angle
return np.random.rand() * (high - low) + low
elif angle_mode == 'value':
return np.random.choice(angle)
def get_scale(self, scale, scale_mode):
assert not scale_mode or scale_mode in [
'range', 'value'
], 'scale mode should be in [range, value, None]'
if not scale_mode:
return scale
elif scale_mode == 'range':
low, high = scale
return np.random.rand() * (high - low) + low
elif scale_mode == 'value':
return np.random.choice(scale)
def apply(self, sample, context=None):
if np.random.rand() > self.rotate_prob:
return sample
angle = self.get_angle(self.angle, self.angle_mode)
scale = self.get_scale(self.scale, self.scale_mode)
rotator = RRotate(scale, angle, self.fill_value, self.auto_bound)
return rotator(sample)
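# Illustrative usage sketch (not part of the original file): rotate a synthetic sample by
# an angle drawn uniformly from [-45, 45] degrees with probability 0.5; the scale stays
# fixed at 1.0 because scale_mode is left as None.
def _demo_random_rrotate():
    sample = {
        'image': np.zeros((240, 320, 3), dtype=np.uint8),
        'gt_poly': np.array([[40., 40., 120., 40., 120., 90., 40., 90.]],
                            dtype=np.float32),
    }
    op = RandomRRotate(scale=1.0, angle=(-45., 45.), angle_mode='range',
                       rotate_prob=0.5)
    return op.apply(sample)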
@register_op
class Poly2RBox(BaseOperator):
""" Polygon to Rotated Box, using new OpenCV definition since 4.5.1
Args:
filter_threshold (int, float): threshold to filter annotations
filter_mode (str): filter mode, ['area', 'edge']
rbox_type (str): rbox type, ['le135', 'oc']
"""
def __init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'):
super(Poly2RBox, self).__init__()
self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode)
self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np
def filter(self, size, threshold, mode):
if mode == 'area':
if size[0] * size[1] < threshold:
return True
elif mode == 'edge':
if min(size) < threshold:
return True
return False
def get_rbox(self, polys):
valid_ids, rboxes, bboxes = [], [], []
for i, poly in enumerate(polys):
cx, cy, w, h, angle = self.rbox_fn(poly)
if self.filter_fn((w, h)):
continue
rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32))
valid_ids.append(i)
xmin, ymin = min(poly[0::2]), min(poly[1::2])
xmax, ymax = max(poly[0::2]), max(poly[1::2])
bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32))
if len(valid_ids) == 0:
rboxes = np.zeros((0, 5), dtype=np.float32)
bboxes = np.zeros((0, 4), dtype=np.float32)
else:
rboxes = np.stack(rboxes)
bboxes = np.stack(bboxes)
return rboxes, bboxes, valid_ids
def apply(self, sample, context=None):
rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly'])
sample['gt_rbox'] = rboxes
sample['gt_bbox'] = bboxes
for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']:
if k in sample:
sample[k] = sample[k][valid_ids]
return sample
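# Illustrative usage sketch (not part of the original file): convert (N, 8) polygons to
# 5-parameter rotated boxes in the 'le135' definition, discarding boxes whose shorter
# edge is below 4 pixels; 'gt_class' is filtered consistently with the kept boxes.
def _demo_poly2rbox():
    sample = {
        'gt_poly': np.array([[40., 40., 120., 40., 120., 90., 40., 90.]],
                            dtype=np.float32),
        'gt_class': np.array([[0]], dtype=np.int32),
    }
    op = Poly2RBox(filter_threshold=4, filter_mode='edge', rbox_type='le135')
    out = op.apply(sample)
    return out['gt_rbox'], out['gt_bbox']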
@register_op
class Poly2Array(BaseOperator):
""" convert gt_poly to np.array for rotated bboxes
"""
def __init__(self):
super(Poly2Array, self).__init__()
def apply(self, sample, context=None):
if 'gt_poly' in sample:
sample['gt_poly'] = np.array(
sample['gt_poly'], dtype=np.float32).reshape((-1, 8))
return sample
@register_op
class RResize(BaseOperator):
def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR):
"""
Resize image to target size. if keep_ratio is True,
resize the image's long side to the maximum of target_size
if keep_ratio is False, resize the image to target size(h, w)
Args:
target_size (int|list): image target size
keep_ratio (bool): whether keep_ratio or not, default true
interp (int): the interpolation method
"""
super(RResize, self).__init__()
self.keep_ratio = keep_ratio
self.interp = interp
if not isinstance(target_size, (Integral, Sequence)):
raise TypeError(
"Type of target_size is invalid. Must be Integer or List or Tuple, now is {}".
format(type(target_size)))
if isinstance(target_size, Integral):
target_size = [target_size, target_size]
self.target_size = target_size
def apply_image(self, image, scale):
im_scale_x, im_scale_y = scale
return cv2.resize(
image,
None,
None,
fx=im_scale_x,
fy=im_scale_y,
interpolation=self.interp)
def apply_pts(self, pts, scale, size):
im_scale_x, im_scale_y = scale
resize_w, resize_h = size
pts[:, 0::2] *= im_scale_x
pts[:, 1::2] *= im_scale_y
pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w)
pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h)
return pts
def apply(self, sample, context=None):
""" Resize the image numpy.
"""
im = sample['image']
if not isinstance(im, np.ndarray):
raise TypeError("{}: image type is not numpy.".format(self))
if len(im.shape) != 3:
raise ImageError('{}: image is not 3-dimensional.'.format(self))
# apply image
im_shape = im.shape
if self.keep_ratio:
im_size_min = np.min(im_shape[0:2])
im_size_max = np.max(im_shape[0:2])
target_size_min = np.min(self.target_size)
target_size_max = np.max(self.target_size)
im_scale = min(target_size_min / im_size_min,
target_size_max / im_size_max)
resize_h = im_scale * float(im_shape[0])
resize_w = im_scale * float(im_shape[1])
im_scale_x = im_scale
im_scale_y = im_scale
else:
resize_h, resize_w = self.target_size
im_scale_y = resize_h / im_shape[0]
im_scale_x = resize_w / im_shape[1]
im = self.apply_image(sample['image'], [im_scale_x, im_scale_y])
sample['image'] = im.astype(np.float32)
sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32)
if 'scale_factor' in sample:
scale_factor = sample['scale_factor']
sample['scale_factor'] = np.asarray(
[scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x],
dtype=np.float32)
else:
sample['scale_factor'] = np.asarray(
[im_scale_y, im_scale_x], dtype=np.float32)
# apply bbox
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
# apply polygon
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'],
[im_scale_x, im_scale_y],
[resize_w, resize_h])
return sample
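# Illustrative usage sketch (not part of the original file): keep-ratio resize so the
# short side fits 800 and the long side fits 1333, a common detection setting; image,
# boxes and polygons are rescaled consistently and 'scale_factor' is recorded.
def _demo_rresize():
    sample = {
        'image': np.zeros((240, 320, 3), dtype=np.uint8),
        'gt_poly': np.array([[40., 40., 120., 40., 120., 90., 40., 90.]],
                            dtype=np.float32),
    }
    op = RResize(target_size=[800, 1333], keep_ratio=True)
    out = op.apply(sample)
    return out['im_shape'], out['scale_factor']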
@register_op
class RandomRFlip(BaseOperator):
def __init__(self, prob=0.5):
"""
Args:
prob (float): the probability of flipping image
"""
super(RandomRFlip, self).__init__()
self.prob = prob
if not (isinstance(self.prob, float)):
raise TypeError("{}: input type is invalid.".format(self))
def apply_image(self, image):
return image[:, ::-1, :]
def apply_pts(self, pts, width):
oldx = pts[:, 0::2].copy()
pts[:, 0::2] = width - oldx - 1
return pts
def apply(self, sample, context=None):
"""Filp the image and bounding box.
Operators:
1. Flip the image numpy.
2. Transform the bboxes' x coordinates.
(Must judge whether the coordinates are normalized!)
3. Transform the segmentations' x coordinates.
(Must judge whether the coordinates are normalized!)
Output:
sample: the image, bounding box and segmentation part
in sample are flipped.
"""
if np.random.uniform(0, 1) < self.prob:
im = sample['image']
height, width = im.shape[:2]
im = self.apply_image(im)
if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width)
if 'gt_poly' in sample and len(sample['gt_poly']) > 0:
sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width)
sample['flipped'] = True
sample['image'] = im
return sample
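# Illustrative usage sketch (not part of the original file): horizontally flip the image
# and its polygons with probability 0.5; x coordinates here are in pixels, so the flip
# maps x to (width - x - 1).
def _demo_random_rflip():
    sample = {
        'image': np.zeros((240, 320, 3), dtype=np.uint8),
        'gt_poly': np.array([[40., 40., 120., 40., 120., 90., 40., 90.]],
                            dtype=np.float32),
    }
    return RandomRFlip(prob=0.5).apply(sample)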
@register_op
class VisibleRBox(BaseOperator):
"""
In debug mode, visualize images according to `gt_box`.
(Currently only supported when not cropping and flipping image.)
"""
def __init__(self, output_dir='debug'):
super(VisibleRBox, self).__init__()
self.output_dir = output_dir
if not os.path.isdir(output_dir):
os.makedirs(output_dir)
def apply(self, sample, context=None):
image = Image.fromarray(sample['image'].astype(np.uint8))
out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
width = sample['w']
height = sample['h']
# gt_poly = sample['gt_rbox']
gt_poly = sample['gt_poly']
gt_class = sample['gt_class']
draw = ImageDraw.Draw(image)
for i in range(gt_poly.shape[0]):
x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i]
draw.line(
[(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)],
width=2,
fill='green')
# draw label
xmin = min(x1, x2, x3, x4)
ymin = min(y1, y2, y3, y4)
text = str(gt_class[i][0])
tw, th = imagedraw_textsize_c(draw, text)
draw.rectangle(
[(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
if 'gt_keypoint' in sample.keys():
gt_keypoint = sample['gt_keypoint']
            # `is_normalized` is never set in __init__, so default to False here
            if getattr(self, 'is_normalized', False):
for i in range(gt_keypoint.shape[1]):
if i % 2:
gt_keypoint[:, i] = gt_keypoint[:, i] * height
else:
gt_keypoint[:, i] = gt_keypoint[:, i] * width
for i in range(gt_keypoint.shape[0]):
keypoint = gt_keypoint[i]
for j in range(int(keypoint.shape[0] / 2)):
x1 = round(keypoint[2 * j]).astype(np.int32)
y1 = round(keypoint[2 * j + 1]).astype(np.int32)
draw.ellipse(
(x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
save_path = os.path.join(self.output_dir, out_file_name)
image.save(save_path, quality=95)
return sample
@register_op
class Rbox2Poly(BaseOperator):
"""
Convert rbbox format to poly format.
"""
def __init__(self):
super(Rbox2Poly, self).__init__()
def apply(self, sample, context=None):
assert 'gt_rbox' in sample
assert sample['gt_rbox'].shape[1] == 5
rboxes = sample['gt_rbox']
polys = rbox2poly_np(rboxes)
sample['gt_poly'] = polys
xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1)
xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1)
        sample['gt_bbox'] = np.stack([xmin, ymin, xmax, ymax], axis=1)
return sample
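# Illustrative usage sketch (not part of the original file): recover (N, 8) polygons and
# axis-aligned (N, 4) boxes from 5-parameter rotated boxes [cx, cy, w, h, angle].
def _demo_rbox2poly():
    sample = {
        'gt_rbox': np.array([[100., 80., 60., 30., 0.3]], dtype=np.float32),
    }
    out = Rbox2Poly().apply(sample)
    return out['gt_poly'], out['gt_bbox']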

View File

@@ -0,0 +1,72 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numbers
import numpy as np
try:
from collections.abc import Sequence, Mapping
except Exception:
from collections import Sequence, Mapping
def default_collate_fn(batch):
    """
    Default batch collating function for :code:`paddle.io.DataLoader`.
    The input is a list of sample data; each sample may be composed of lists,
    dictionaries, strings, numbers and numpy arrays. This function parses the
    input recursively and stacks numbers, numpy arrays and paddle.Tensor
    fields into batched data. e.g. for the following input:
    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
    this default collate function zips the number and numpy array fields
    together and stacks each field into a batch field as follows:
    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
    Args:
        batch (list): a list of sample data.
    Returns:
        Batched data: each number, numpy array and paddle.Tensor in the input
        is batched.
    """
sample = batch[0]
if isinstance(sample, np.ndarray):
batch = np.stack(batch, axis=0)
return batch
elif isinstance(sample, numbers.Number):
batch = np.array(batch)
return batch
elif isinstance(sample, (str, bytes)):
return batch
elif isinstance(sample, Mapping):
return {
key: default_collate_fn([d[key] for d in batch])
for key in sample
}
elif isinstance(sample, Sequence):
sample_fields_num = len(sample)
        if not all(len(s) == sample_fields_num for s in iter(batch)):
            raise RuntimeError(
                "fields number not the same among samples in a batch")
        return [default_collate_fn(fields) for fields in zip(*batch)]
    raise TypeError("batch data can only contain: tensor, numpy.ndarray, "
                    "dict, list, number, but got {}".format(type(sample)))