更换文档检测模型

This commit is contained in:
2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions

View File

@@ -0,0 +1,83 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Import every architecture module (their classes register themselves via
# @register) and then re-export their public names. The two lists below
# must stay in sync: each `from .X import *` needs a matching
# `from . import X`. Added the missing `blazeface` and `keypoint_petr`
# entries to the module-import list for consistency.
from . import meta_arch
from . import faster_rcnn
from . import mask_rcnn
from . import yolo
from . import ppyoloe
from . import cascade_rcnn
from . import ssd
from . import fcos
from . import solov2
from . import ttfnet
from . import s2anet
from . import keypoint_hrhrnet
from . import keypoint_hrnet
from . import keypoint_vitpose
from . import jde
from . import deepsort
from . import fairmot
from . import centernet
from . import blazeface
from . import gfl
from . import picodet
from . import detr
from . import sparse_rcnn
from . import tood
from . import retinanet
from . import bytetrack
from . import yolox
from . import yolof
from . import pose3d_metro
from . import centertrack
from . import queryinst
from . import keypoint_petr
from . import detr_ssod
from . import multi_stream_detector
from . import clrnet
from .meta_arch import *
from .faster_rcnn import *
from .mask_rcnn import *
from .yolo import *
from .ppyoloe import *
from .cascade_rcnn import *
from .ssd import *
from .fcos import *
from .solov2 import *
from .ttfnet import *
from .s2anet import *
from .keypoint_hrhrnet import *
from .keypoint_hrnet import *
from .keypoint_vitpose import *
from .jde import *
from .deepsort import *
from .fairmot import *
from .centernet import *
from .blazeface import *
from .gfl import *
from .picodet import *
from .detr import *
from .sparse_rcnn import *
from .tood import *
from .retinanet import *
from .bytetrack import *
from .yolox import *
from .yolof import *
from .pose3d_metro import *
from .centertrack import *
from .queryinst import *
from .keypoint_petr import *
from .detr_ssod import *
from .multi_stream_detector import *
from .clrnet import *

View File

@@ -0,0 +1,117 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['BlazeFace']
@register
class BlazeFace(BaseArch):
    """
    BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs,
    see https://arxiv.org/abs/1907.05047

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): neck instance
        blaze_head (nn.Layer): `blazeHead` instance
        post_process (object): `BBoxPostProcess` instance
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self, backbone, blaze_head, neck, post_process):
        super(BlazeFace, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.blaze_head = blaze_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules in dependency order, threading output shapes
        backbone -> neck -> head. `post_process` is injected separately."""
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        blaze_head = create(cfg['blaze_head'], input_shape=neck.out_shape)
        return {
            'backbone': backbone,
            'neck': neck,
            'blaze_head': blaze_head,
        }

    def _forward(self):
        # backbone -> neck feature extraction
        feats = self.neck(self.backbone(self.inputs))
        if self.training:
            # In train mode the head computes the loss directly.
            return self.blaze_head(feats, self.inputs['image'],
                                   self.inputs['gt_bbox'],
                                   self.inputs['gt_class'])
        preds, anchors = self.blaze_head(feats, self.inputs['image'])
        bbox, bbox_num, nms_keep_idx = self.post_process(
            preds, anchors, self.inputs['im_shape'],
            self.inputs['scale_factor'])
        if not self.use_extra_data:
            return bbox, bbox_num
        # Record raw head outputs before NMS for downstream consumers.
        # preds[1] holds the class logits: [[1xNumBBoxNumClass]]
        cls_logits = paddle.concat(preds[1], axis=1)
        extra_data = {
            'scores': F.softmax(cls_logits).transpose([0, 2, 1]),
            'logits': cls_logits.transpose([0, 2, 1]),
            'nms_keep_idx': nms_keep_idx,  # bbox index before nms
        }
        return bbox, bbox_num, extra_data

    def get_loss(self):
        return {"loss": self._forward()}

    def get_pred(self):
        if self.use_extra_data:
            bbox_pred, bbox_num, extra_data = self._forward()
            return {
                "bbox": bbox_pred,
                "bbox_num": bbox_num,
                "extra_data": extra_data
            }
        bbox_pred, bbox_num = self._forward()
        return {"bbox": bbox_pred, "bbox_num": bbox_num}

View File

@@ -0,0 +1,83 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['ByteTrack']
@register
class ByteTrack(BaseArch):
    """
    ByteTrack network, see https://arxiv.org/abs/2110.06864

    Args:
        detector (object): detector model instance
        reid (object): reid model instance, default None
        tracker (object): tracker instance
    """
    __category__ = 'architecture'

    def __init__(self, detector='YOLOX', reid=None, tracker='JDETracker'):
        super(ByteTrack, self).__init__()
        self.detector = detector
        self.reid = reid
        self.tracker = tracker

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # The string 'None' in the config means no ReID sub-model.
        reid = create(cfg['reid']) if cfg['reid'] != 'None' else None
        return {
            "detector": create(cfg['detector']),
            "reid": reid,
            "tracker": create(cfg['tracker']),
        }

    def _forward(self):
        det_outs = self.detector(self.inputs)
        if self.training:
            return det_outs
        # Eval: optionally attach ReID embeddings for the detected crops.
        if self.reid is None:
            det_outs['embeddings'] = None
        else:
            assert 'crops' in self.inputs
            det_outs['embeddings'] = self.reid(self.inputs['crops'])
        return det_outs

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,143 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['CascadeRCNN']
@register
class CascadeRCNN(BaseArch):
    """
    Cascade R-CNN network, see https://arxiv.org/abs/1712.00726
    Args:
        backbone (object): backbone instance
        rpn_head (object): `RPNHead` instance
        bbox_head (object): `BBoxHead` instance
        bbox_post_process (object): `BBoxPostProcess` instance
        neck (object): 'FPN' instance
        mask_head (object): `MaskHead` instance
        mask_post_process (object): `MaskPostProcess` instance
    """
    __category__ = 'architecture'
    __inject__ = [
        'bbox_post_process',
        'mask_post_process',
    ]

    def __init__(self,
                 backbone,
                 rpn_head,
                 bbox_head,
                 bbox_post_process,
                 neck=None,
                 mask_head=None,
                 mask_post_process=None):
        super(CascadeRCNN, self).__init__()
        self.backbone = backbone
        self.rpn_head = rpn_head
        self.bbox_head = bbox_head
        self.bbox_post_process = bbox_post_process
        self.neck = neck
        self.mask_head = mask_head
        self.mask_post_process = mask_post_process
        # Mask branch (Cascade Mask R-CNN) is active only when a mask
        # head was configured.
        self.with_mask = mask_head is not None

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules from config, threading output shapes
        backbone -> (optional) neck -> rpn/bbox/mask heads."""
        backbone = create(cfg['backbone'])
        kwargs = {'input_shape': backbone.out_shape}
        # `cfg['neck'] and ...` leaves neck falsy (None) when unset.
        neck = cfg['neck'] and create(cfg['neck'], **kwargs)
        out_shape = neck and neck.out_shape or backbone.out_shape
        kwargs = {'input_shape': out_shape}
        rpn_head = create(cfg['rpn_head'], **kwargs)
        bbox_head = create(cfg['bbox_head'], **kwargs)
        # Without a neck (e.g. C4-style models) the mask head consumes
        # the bbox head's feature shape instead.
        out_shape = neck and out_shape or bbox_head.get_head().out_shape
        kwargs = {'input_shape': out_shape}
        mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs)
        return {
            'backbone': backbone,
            'neck': neck,
            "rpn_head": rpn_head,
            "bbox_head": bbox_head,
            "mask_head": mask_head,
        }

    def _forward(self):
        """Shared train/eval forward.

        Returns (rpn_loss, bbox_loss, mask_loss) dicts in training and
        (bbox_pred, bbox_num, mask_pred-or-None) in eval.
        """
        body_feats = self.backbone(self.inputs)
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        if self.training:
            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
            bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
                                                  self.inputs)
            # Re-fetch the rois/targets the bbox head actually assigned so
            # the mask head trains on the same sampled proposals.
            rois, rois_num = self.bbox_head.get_assigned_rois()
            bbox_targets = self.bbox_head.get_assigned_targets()
            if self.with_mask:
                mask_loss = self.mask_head(body_feats, rois, rois_num,
                                           self.inputs, bbox_targets, bbox_feat)
                return rpn_loss, bbox_loss, mask_loss
            else:
                return rpn_loss, bbox_loss, {}
        else:
            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
            preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs)
            # Cascade refinement: decode from the last-stage refined rois.
            refined_rois = self.bbox_head.get_refined_rois()
            im_shape = self.inputs['im_shape']
            scale_factor = self.inputs['scale_factor']
            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
                preds, (refined_rois, rois_num), im_shape, scale_factor)
            # rescale the prediction back to origin image
            bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
                bbox, bbox_num, im_shape, scale_factor)
            if not self.with_mask:
                return bbox_pred, bbox_num, None
            # Mask branch uses the un-rescaled boxes (`bbox`) for RoI
            # extraction, then pastes masks at the original image shape.
            mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs)
            origin_shape = self.bbox_post_process.get_origin_shape()
            mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
                                               origin_shape)
            return bbox_pred, bbox_num, mask_pred

    def get_loss(self, ):
        """Sum all branch losses into a single 'loss' entry."""
        rpn_loss, bbox_loss, mask_loss = self._forward()
        loss = {}
        loss.update(rpn_loss)
        loss.update(bbox_loss)
        if self.with_mask:
            loss.update(mask_loss)
        total_loss = paddle.add_n(list(loss.values()))
        loss.update({'loss': total_loss})
        return loss

    def get_pred(self):
        bbox_pred, bbox_num, mask_pred = self._forward()
        output = {
            'bbox': bbox_pred,
            'bbox_num': bbox_num,
        }
        if self.with_mask:
            output.update({'mask': mask_pred})
        return output

View File

@@ -0,0 +1,103 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['CenterNet']
@register
class CenterNet(BaseArch):
    """
    CenterNet network, see http://arxiv.org/abs/1904.07850

    Args:
        backbone (object): backbone instance
        neck (object): FPN instance, default use 'CenterNetDLAFPN'
        head (object): 'CenterNetHead' instance
        post_process (object): 'CenterNetPostProcess' instance
        for_mot (bool): whether return other features used in tracking model
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']
    __shared__ = ['for_mot']

    def __init__(self,
                 backbone,
                 neck='CenterNetDLAFPN',
                 head='CenterNetHead',
                 post_process='CenterNetPostProcess',
                 for_mot=False):
        super(CenterNet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.post_process = post_process
        self.for_mot = for_mot

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        # The neck is optional; the head consumes whichever feature
        # shape applies (neck output if present, else backbone output).
        if cfg['neck']:
            neck = create(cfg['neck'], input_shape=backbone.out_shape)
        else:
            neck = cfg['neck']
        feat_shape = neck and neck.out_shape or backbone.out_shape
        head = create(cfg['head'], input_shape=feat_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        feats = self.backbone(self.inputs)
        if self.neck is not None:
            feats = self.neck(feats)
        head_out = self.head(feats, self.inputs)
        if self.for_mot:
            # Tracking models also consume the neck feature map.
            head_out.update({'neck_feat': feats})
        elif self.training:
            head_out['loss'] = head_out.pop('det_loss')
        return head_out

    def get_pred(self):
        head_out = self._forward()
        bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process(
            head_out['heatmap'],
            head_out['size'],
            head_out['offset'],
            im_shape=self.inputs['im_shape'],
            scale_factor=self.inputs['scale_factor'])
        output = {"bbox": bbox, "bbox_num": bbox_num}
        if self.for_mot:
            # Extra tensors needed by the tracking pipeline.
            output.update({
                "bbox_inds": bbox_inds,
                "topk_clses": topk_clses,
                "topk_ys": topk_ys,
                "topk_xs": topk_xs,
                "neck_feat": head_out['neck_feat'],
            })
        return output

    def get_loss(self):
        return self._forward()

View File

@@ -0,0 +1,176 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import math
import numpy as np
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..keypoint_utils import affine_transform
from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian
__all__ = ['CenterTrack']
@register
class CenterTrack(BaseArch):
    """
    CenterTrack network, see http://arxiv.org/abs/2004.01177
    Args:
        detector (object): 'CenterNet' instance
        plugin_head (object): 'CenterTrackHead' instance
        tracker (object): 'CenterTracker' instance
    """
    __category__ = 'architecture'
    __shared__ = ['mot_metric']

    def __init__(self,
                 detector='CenterNet',
                 plugin_head='CenterTrackHead',
                 tracker='CenterTracker',
                 mot_metric=False):
        super(CenterTrack, self).__init__()
        self.detector = detector
        self.plugin_head = plugin_head
        self.tracker = tracker
        self.mot_metric = mot_metric
        # Previous-frame image, kept across _forward calls during MOT
        # inference (bs=1 only); cleared by reset_tracking().
        self.pre_image = None
        self.deploy = False

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        detector = create(cfg['detector'])
        # Plugin head consumes the detector's feature-map shape
        # (neck output if present, else backbone output).
        detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape
        kwargs = {'input_shape': detector_out_shape}
        plugin_head = create(cfg['plugin_head'], **kwargs)
        tracker = create(cfg['tracker'])
        return {
            'detector': detector,
            'plugin_head': plugin_head,
            'tracker': tracker,
        }

    def _forward(self):
        """Train: combined detector + tracking-branch losses.
        Eval: plain detection, or full MOT inference when mot_metric."""
        if self.training:
            det_outs = self.detector(self.inputs)
            neck_feat = det_outs['neck_feat']
            losses = {}
            # Collect only loss entries from the detector output.
            for k, v in det_outs.items():
                if 'loss' not in k: continue
                losses.update({k: v})
            plugin_outs = self.plugin_head(neck_feat, self.inputs)
            for k, v in plugin_outs.items():
                if 'loss' not in k: continue
                losses.update({k: v})
            # Total = detection loss + tracking-branch loss.
            losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss']
            return losses
        else:
            if not self.mot_metric:
                # detection, support bs>=1
                det_outs = self.detector(self.inputs)
                return {
                    'bbox': det_outs['bbox'],
                    'bbox_num': det_outs['bbox_num']
                }
            else:
                # MOT, only support bs=1
                if not self.deploy:
                    if self.pre_image is None:
                        self.pre_image = self.inputs['image']
                        # initializing tracker for the first frame
                        self.tracker.init_track([])
                    self.inputs['pre_image'] = self.pre_image
                    self.pre_image = self.inputs[
                        'image']  # Note: update for next image
                    # render input heatmap from tracker status
                    pre_hm = self.get_additional_inputs(
                        self.tracker.tracks, self.inputs, with_hm=True)
                    self.inputs['pre_hm'] = paddle.to_tensor(pre_hm)
                # model inference
                det_outs = self.detector(self.inputs)
                neck_feat = det_outs['neck_feat']
                result = self.plugin_head(
                    neck_feat, self.inputs, det_outs['bbox'],
                    det_outs['bbox_inds'], det_outs['topk_clses'],
                    det_outs['topk_ys'], det_outs['topk_xs'])
                if not self.deploy:
                    # convert the cropped and 4x downsampled output coordinate system
                    # back to the input image coordinate system
                    result = self.plugin_head.centertrack_post_process(
                        result, self.inputs, self.tracker.out_thresh)
                return result

    def get_pred(self):
        return self._forward()

    def get_loss(self):
        return self._forward()

    def reset_tracking(self):
        """Clear tracker state between video sequences."""
        self.tracker.reset()
        self.pre_image = None

    def get_additional_inputs(self, dets, meta, with_hm=True):
        # Render input heatmap from previous trackings.
        trans_input = meta['trans_input'][0].numpy()
        inp_width, inp_height = int(meta['inp_width'][0]), int(meta[
            'inp_height'][0])
        input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32)
        for det in dets:
            # Skip low-confidence previous tracks.
            if det['score'] < self.tracker.pre_thresh:
                continue
            # Map the previous-frame box into the network input space.
            bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width,
                                         inp_height)
            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
            if (h > 0 and w > 0):
                radius = gaussian_radius(
                    (math.ceil(h), math.ceil(w)), min_overlap=0.7)
                radius = max(0, int(radius))
                ct = np.array(
                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
                    dtype=np.float32)
                ct_int = ct.astype(np.int32)
                if with_hm:
                    # Splat a gaussian peak at each previous track center.
                    input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int,
                                                      radius)
        if with_hm:
            # Add batch dimension -> (1, 1, H, W).
            input_hm = input_hm[np.newaxis]
        return input_hm
def affine_transform_bbox(bbox, trans, width, height):
    """Apply affine transform `trans` to an xyxy bbox and clip the
    result to the [0, width-1] x [0, height-1] image bounds."""
    out = np.array(copy.deepcopy(bbox), dtype=np.float32)
    # Transform the two corner points independently.
    out[:2] = affine_transform(out[:2], trans)
    out[2:] = affine_transform(out[2:], trans)
    # Clip x coords (indices 0, 2) and y coords (indices 1, 3).
    out[0::2] = np.clip(out[0::2], 0, width - 1)
    out[1::2] = np.clip(out[1::2], 0, height - 1)
    return out

View File

@@ -0,0 +1,67 @@
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
from paddle import in_dynamic_mode
__all__ = ['CLRNet']
@register
class CLRNet(BaseArch):
    """CLRNet lane detection architecture: backbone -> FPN -> CLRHead."""
    __category__ = 'architecture'

    def __init__(self,
                 backbone="CLRResNet",
                 neck="CLRFPN",
                 clr_head="CLRHead",
                 post_process=None):
        super(CLRNet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.heads = clr_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build sub-modules in dependency order, threading shapes."""
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        clr_head = create(cfg['clr_head'], input_shape=neck.out_shape)
        return {
            'backbone': backbone,
            'neck': neck,
            'clr_head': clr_head,
        }

    def _forward(self):
        feats = self.neck(self.backbone(self.inputs['image']))
        if self.training:
            # Head computes the loss dict directly in train mode.
            return self.heads(feats, self.inputs)
        output = {'lanes': self.heads(feats)}
        # TODO: hard code fix as_lanes=False problem in clrnet_head.py
        # "get_lanes" function for static mode
        if in_dynamic_mode():
            output = {
                "lanes": self.heads.get_lanes(output['lanes']),
                "img_path": self.inputs['full_img_path'],
                "img_name": self.inputs['img_name']
            }
        return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,70 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box
__all__ = ['DeepSORT']
@register
class DeepSORT(BaseArch):
    """
    DeepSORT network, see https://arxiv.org/abs/1703.07402

    Args:
        detector (object): detector model instance
        reid (object): reid model instance
        tracker (object): tracker instance
    """
    __category__ = 'architecture'

    def __init__(self,
                 detector='YOLOv3',
                 reid='PCBPyramid',
                 tracker='DeepSORTTracker'):
        super(DeepSORT, self).__init__()
        self.detector = detector
        self.reid = reid
        self.tracker = tracker

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # The detector may be disabled with the string 'None' (e.g. when
        # detections are supplied externally); ReID and tracker are
        # always built.
        detector = None if cfg['detector'] == 'None' else create(
            cfg['detector'])
        return {
            "detector": detector,
            "reid": create(cfg['reid']),
            "tracker": create(cfg['tracker']),
        }

    def _forward(self):
        # Embedding extraction only: detection crops -> ReID features.
        return {'embeddings': self.reid(self.inputs['crops'])}

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,118 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
__all__ = ['DETR']
# Deformable DETR, DINO use the same architecture as DETR
@register
class DETR(BaseArch):
    """
    DETR detection architecture: backbone -> (optional) neck ->
    transformer -> DETR head, with optional post processing.

    Deformable DETR and DINO reuse this same architecture.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process', 'post_process_semi']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 post_process_semi=None,
                 with_mask=False,
                 exclude_post_process=False):
        super(DETR, self).__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        # Whether the model also outputs instance masks.
        self.with_mask = with_mask
        # When True, _forward returns the raw head predictions and skips
        # post_process (presumably for deployment/export — confirm).
        self.exclude_post_process = exclude_post_process
        # Post process used by semi-supervised training (DETR_SSOD calls
        # teacher.post_process_semi).
        self.post_process_semi = post_process_semi

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # neck
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None
        # transformer
        if neck is not None:
            kwargs = {'input_shape': neck.out_shape}
        transformer = create(cfg['transformer'], **kwargs)
        # head
        # NOTE(review): the head's input_shape uses backbone.out_shape
        # even when a neck is present — confirm this is intended.
        kwargs = {
            'hidden_dim': transformer.hidden_dim,
            'nhead': transformer.nhead,
            'input_shape': backbone.out_shape
        }
        detr_head = create(cfg['detr_head'], **kwargs)
        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        """Train: loss dict (with summed 'loss'). Eval: bbox output dict."""
        # Backbone
        body_feats = self.backbone(self.inputs)
        # Neck
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        # Transformer; pad_mask is optional (padded batches only).
        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(body_feats, pad_mask, self.inputs)
        # DETR Head
        if self.training:
            detr_losses = self.detr_head(out_transformer, body_feats,
                                         self.inputs)
            # Total loss = sum of all non-logging loss entries.
            detr_losses.update({
                'loss': paddle.add_n(
                    [v for k, v in detr_losses.items() if 'log' not in k])
            })
            return detr_losses
        else:
            preds = self.detr_head(out_transformer, body_feats)
            if self.exclude_post_process:
                bbox, bbox_num, mask = preds
            else:
                # paddle.shape(...) keeps this static-graph friendly.
                bbox, bbox_num, mask = self.post_process(
                    preds, self.inputs['im_shape'], self.inputs['scale_factor'],
                    paddle.shape(self.inputs['image'])[2:])
            output = {'bbox': bbox, 'bbox_num': bbox_num}
            if self.with_mask:
                output['mask'] = mask
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,341 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create, merge_config
import paddle
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register, create
from ppdet.utils.logger import setup_logger
from ppdet.modeling.ssod.utils import filter_invalid
from .multi_stream_detector import MultiSteamDetector
logger = setup_logger(__name__)
__all__ = ['DETR_SSOD']
__shared__ = ['num_classes']
@register
class DETR_SSOD(MultiSteamDetector):
    def __init__(self,
                 teacher,
                 student,
                 train_cfg=None,
                 test_cfg=None,
                 RTDETRTransformer=None,
                 num_classes=80):
        # Teacher-student pair for semi-supervised DETR training; the
        # teacher is an EMA copy of the student (see forward_train).
        super(DETR_SSOD, self).__init__(
            dict(
                teacher=teacher, student=student),
            train_cfg=train_cfg,
            test_cfg=test_cfg, )
        # Iteration at which EMA updates / semi-supervised training start.
        # NOTE(review): train_cfg is indexed unconditionally here, so the
        # default train_cfg=None would raise — confirm it is always given.
        self.ema_start_iters = train_cfg['ema_start_iters']
        self.momentum = 0.9996  # EMA decay used for teacher updates
        self.cls_thr = None
        self.cls_thr_ig = None
        self.num_classes = num_classes
        if train_cfg is not None:
            # Teacher only produces pseudo labels; never optimized.
            self.freeze("teacher")
            self.unsup_weight = self.train_cfg['unsup_weight']
            self.sup_weight = self.train_cfg['sup_weight']
            self._teacher = None
            self._student = None
            self._transformer = None
    @classmethod
    def from_config(cls, cfg):
        teacher = create(cfg['teacher'])
        # NOTE(review): merge_config is called between creating teacher
        # and student — presumably so the student is built under the
        # merged config; confirm against the workspace semantics.
        merge_config(cfg)
        student = create(cfg['student'])
        train_cfg = cfg['train_cfg']
        test_cfg = cfg['test_cfg']
        RTDETRTransformer = cfg['RTDETRTransformer']
        return {
            'teacher': teacher,
            'student': student,
            'train_cfg': train_cfg,
            'test_cfg': test_cfg,
            'RTDETRTransformer': RTDETRTransformer
        }
    def forward_train(self, inputs, **kwargs):
        """One semi-supervised training step.

        Before `ema_start_iters` only the supervised branch contributes
        (unsup_* losses are emitted zeroed so logged keys stay stable);
        afterwards teacher pseudo-label losses are added.
        """
        # NOTE(review): iter_id is only bound for dict/list inputs; any
        # other input type would raise NameError below — confirm callers.
        if isinstance(inputs, dict):
            iter_id = inputs['iter_id']
        elif isinstance(inputs, list):
            iter_id = inputs[-1]
        # EMA teacher: initialized (momentum=0 -> copy of student) exactly
        # at ema_start_iters, then decayed with self.momentum.
        if iter_id == self.ema_start_iters:
            self.update_ema_model(momentum=0)
        elif iter_id > self.ema_start_iters:
            self.update_ema_model(momentum=self.momentum)
        if iter_id > self.ema_start_iters:
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
            # Weak/strong supervised batches must share one spatial shape
            # before being concatenated below.
            if data_sup_w['image'].shape != data_sup_s['image'].shape:
                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
                                                                 data_sup_s)
            # Unlabeled batches must not carry ground-truth annotations.
            if 'gt_bbox' in data_unsup_s.keys():
                del data_unsup_s['gt_bbox']
            if 'gt_class' in data_unsup_s.keys():
                del data_unsup_s['gt_class']
            if 'gt_class' in data_unsup_w.keys():
                del data_unsup_w['gt_class']
            if 'gt_bbox' in data_unsup_w.keys():
                del data_unsup_w['gt_bbox']
            # Merge weak+strong supervised samples into one batch:
            # list-typed targets are extended, tensors concatenated.
            for k, v in data_sup_s.items():
                if k in ['epoch_id']:
                    continue
                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
                    data_sup_s[k].extend(data_sup_w[k])
                else:
                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
            loss = {}
            # Supervised branch: full student forward on labeled data.
            body_feats = self.student.backbone(data_sup_s)
            if self.student.neck is not None:
                body_feats = self.student.neck(body_feats)
            out_transformer = self.student.transformer(body_feats, None,
                                                       data_sup_s)
            sup_loss = self.student.detr_head(out_transformer, body_feats,
                                              data_sup_s)
            sup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in sup_loss.items() if 'log' not in k])
            })
            sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
            loss.update(**sup_loss)
            # Unsupervised branch: pseudo-label training via the teacher.
            unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s)
            unsup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in unsup_loss.items() if 'log' not in k])
            })
            unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()}
            # NOTE(review): this second 'loss' update double-counts the
            # already-summed 'unsup_loss' entry; it is overwritten by the
            # final total below, so it only affects logged values.
            unsup_loss.update({
                'loss': paddle.add_n(
                    [v for k, v in unsup_loss.items() if 'log' not in k])
            })
            loss.update(**unsup_loss)
            loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']})
        else:
            if iter_id == self.ema_start_iters:
                logger.info("start semi_supervised_traing")
            data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs
            if data_sup_w['image'].shape != data_sup_s['image'].shape:
                data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w,
                                                                 data_sup_s)
            for k, v in data_sup_s.items():
                if k in ['epoch_id']:
                    continue
                elif k in ['gt_class', 'gt_bbox', 'is_crowd']:
                    data_sup_s[k].extend(data_sup_w[k])
                else:
                    data_sup_s[k] = paddle.concat([v, data_sup_w[k]])
            loss = {}
            sup_loss = self.student(data_sup_s)
            # Zeroed unsup entries keep the loss-dict schema constant
            # during warm-up.
            unsup_loss = {
                "unsup_" + k: v * paddle.to_tensor(0)
                for k, v in sup_loss.items()
            }
            sup_loss = {"sup_" + k: v for k, v in sup_loss.items()}
            loss.update(**sup_loss)
            unsup_loss.update({
                'loss': paddle.add_n(
                    [v * 0 for k, v in sup_loss.items() if 'log' not in k])
            })
            # NOTE(review): this rename double-prefixes keys to
            # 'unsup_unsup_*'; values are all zero so it only affects
            # logged key names.
            unsup_loss = {"unsup_" + k: v * 0 for k, v in unsup_loss.items()}
            loss.update(**unsup_loss)
            loss.update({'loss': loss['sup_loss']})
        return loss
    def foward_unsup_train(self, data_unsup_w, data_unsup_s):
        """Pseudo-label the weak-augmented unlabeled batch with the
        frozen teacher, then compute the student loss on the
        strong-augmented counterpart.

        NOTE(review): the name keeps the original 'foward' typo because
        forward_train() calls it by this exact name.
        """
        # Teacher inference only — no gradients needed.
        with paddle.no_grad():
            body_feats = self.teacher.backbone(data_unsup_w)
            if self.teacher.neck is not None:
                body_feats = self.teacher.neck(body_feats, is_teacher=True)
            out_transformer = self.teacher.transformer(
                body_feats, None, data_unsup_w, is_teacher=True)
            preds = self.teacher.detr_head(out_transformer, body_feats)
            bbox, bbox_num = self.teacher.post_process_semi(preds)
        self.place = body_feats[0].place
        # Split the flat prediction tensor back into per-image lists.
        # Column layout inferred from the slices below: [:1] label,
        # [1:num_classes+1] per-class scores, [-4:] box coords —
        # NOTE(review): confirm against post_process_semi.
        proposal_bbox_list = bbox[:, -4:]
        proposal_bbox_list = proposal_bbox_list.split(
            tuple(np.array(bbox_num)), 0)
        proposal_label_list = paddle.cast(bbox[:, :1], np.float32)
        proposal_label_list = proposal_label_list.split(
            tuple(np.array(bbox_num)), 0)
        proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1],
                                          np.float32)
        proposal_score_list = proposal_score_list.split(
            tuple(np.array(bbox_num)), 0)
        # Re-materialize the splits on the original device.
        proposal_bbox_list = [
            paddle.to_tensor(
                p, place=self.place) for p in proposal_bbox_list
        ]
        proposal_label_list = [
            paddle.to_tensor(
                p, place=self.place) for p in proposal_label_list
        ]
        # filter invalid box roughly
        if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float):
            thr = self.train_cfg['pseudo_label_initial_score_thr']
        else:
            # TODO: use dynamic threshold
            raise NotImplementedError(
                "Dynamic Threshold is not implemented yet.")
        # Per-image filtering by score threshold and minimum box size.
        proposal_bbox_list, proposal_label_list, proposal_score_list = list(
            zip(* [
                filter_invalid(
                    proposal[:, :4],
                    proposal_label,
                    proposal_score,
                    thr=thr,
                    min_size=self.train_cfg['min_pseduo_box_size'], )
                for proposal, proposal_label, proposal_score in
                zip(proposal_bbox_list, proposal_label_list,
                    proposal_score_list)
            ]))
        teacher_bboxes = list(proposal_bbox_list)
        teacher_labels = proposal_label_list
        teacher_info = [teacher_bboxes, teacher_labels]
        student_unsup = data_unsup_s
        return self.compute_pseudo_label_loss(student_unsup, teacher_info,
                                              proposal_score_list)
def compute_pseudo_label_loss(self, student_unsup, teacher_info,
                              proposal_score_list):
    """Supervise the student on strongly-augmented data with pseudo labels.

    Args:
        student_unsup (dict): strongly-augmented batch fed to the student;
            mutated in place with 'gt_bbox' / 'gt_class' (and 'gt_score').
        teacher_info (list): [pseudo_bboxes, pseudo_labels], per-image lists
            produced by ``foward_unsup_train``.
        proposal_score_list (list): per-image class-score tensors for the
            kept pseudo boxes.

    Returns:
        dict: student detection losses (all zeroed when no pseudo labels
        exist, so the step contributes no gradient).
    """
    pseudo_bboxes = list(teacher_info[0])
    pseudo_labels = list(teacher_info[1])
    losses = dict()
    # Round-trip through numpy so empty predictions get well-shaped
    # placeholders ([0, 4] boxes / [0, 1] labels).
    for i in range(len(pseudo_bboxes)):
        if pseudo_labels[i].shape[0] == 0:
            pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy()
            pseudo_labels[i] = paddle.zeros([0, 1]).numpy()
        else:
            pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy()
            pseudo_labels[i] = pseudo_labels[i].numpy()
    # Back to tensors on the cached device with the dtypes the head expects.
    for i in range(len(pseudo_bboxes)):
        pseudo_labels[i] = paddle.to_tensor(
            pseudo_labels[i], dtype=paddle.int32, place=self.place)
        pseudo_bboxes[i] = paddle.to_tensor(
            pseudo_bboxes[i], dtype=paddle.float32, place=self.place)
    student_unsup.update({
        'gt_bbox': pseudo_bboxes,
        'gt_class': pseudo_labels
    })
    # Detect the "no pseudo labels at all" case via the coordinate sum.
    # NOTE(review): a non-empty batch whose coordinates all sum to exactly 0
    # would also hit this branch; presumably impossible after filtering.
    pseudo_sum = 0
    for i in range(len(pseudo_bboxes)):
        pseudo_sum += pseudo_bboxes[i].sum()
    if pseudo_sum == 0:  #input fake data when there are no pseudo labels
        # Feed one dummy box so the forward pass stays valid, then zero
        # every loss term.
        pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5
        pseudo_labels[0] = paddle.ones([1, 1]).astype('int32')
        student_unsup.update({
            'gt_bbox': pseudo_bboxes,
            'gt_class': pseudo_labels
        })
        body_feats = self.student.backbone(student_unsup)
        if self.student.neck is not None:
            body_feats = self.student.neck(body_feats)
        out_transformer = self.student.transformer(body_feats, None,
                                                   student_unsup)
        losses = self.student.detr_head(out_transformer, body_feats,
                                        student_unsup)
        for n, v in losses.items():
            losses[n] = v * 0
    else:
        # Keep only the images that actually carry pseudo labels.
        gt_bbox = []
        gt_class = []
        images = []
        proposal_score = []
        for i in range(len(pseudo_bboxes)):
            if pseudo_labels[i].shape[0] == 0:
                continue
            else:
                # Per-box confidence = max class score; presumably used by
                # the head to weight the pseudo-label loss -- TODO confirm
                # in detr_head.
                proposal_score.append(proposal_score_list[i].max(-1)
                                      .unsqueeze(-1))
                gt_class.append(pseudo_labels[i])
                gt_bbox.append(pseudo_bboxes[i])
                images.append(student_unsup['image'][i])
        images = paddle.stack(images)
        student_unsup.update({
            'image': images,
            'gt_bbox': gt_bbox,
            'gt_class': gt_class
        })
        body_feats = self.student.backbone(student_unsup)
        if self.student.neck is not None:
            body_feats = self.student.neck(body_feats)
        out_transformer = self.student.transformer(body_feats, None,
                                                   student_unsup)
        student_unsup.update({'gt_score': proposal_score})
        losses = self.student.detr_head(out_transformer, body_feats,
                                        student_unsup)
    return losses
def box_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) on the last axis."""
    cx, cy, bw, bh = x.unbind(-1)
    half_w = 0.5 * bw
    half_h = 0.5 * bh
    corners = [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
    return paddle.stack(corners, axis=-1)
def box_xyxy_to_cxcywh(x):
    """Convert boxes from (x1, y1, x2, y2) to (cx, cy, w, h) on the last axis."""
    left, top, right, bottom = x.unbind(-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.stack([center_x, center_y, width, height], axis=-1)
def get_size_with_aspect_ratio(image_size, size, max_size=None):
w, h = image_size
if max_size is not None:
min_original_size = float(min((w, h)))
max_original_size = float(max((w, h)))
if max_original_size / min_original_size * size > max_size:
size = int(round(max_size * min_original_size / max_original_size))
if (w <= h and w == size) or (h <= w and h == size):
return (w, h)
if w < h:
ow = size
oh = int(size * h / w)
else:
oh = size
ow = int(size * w / h)
return (ow, oh)
def align_weak_strong_shape(data_weak, data_strong):
shape_x = data_strong['image'].shape[2]
shape_y = data_strong['image'].shape[3]
target_size = [shape_x, shape_y]
data_weak['image'] = F.interpolate(
data_weak['image'],
size=target_size,
mode='bilinear',
align_corners=False)
return data_weak, data_strong

View File

@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['FairMOT']
@register
class FairMOT(BaseArch):
    """
    FairMOT network, see http://arxiv.org/abs/2004.01888

    Args:
        detector (object): 'CenterNet' instance
        reid (object): 'FairMOTEmbeddingHead' instance
        tracker (object): 'JDETracker' instance
        loss (object): 'FairMOTLoss' instance
    """

    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(self,
                 detector='CenterNet',
                 reid='FairMOTEmbeddingHead',
                 tracker='JDETracker',
                 loss='FairMOTLoss'):
        super(FairMOT, self).__init__()
        self.detector = detector
        self.reid = reid
        self.tracker = tracker
        self.loss = loss

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        detector = create(cfg['detector'])
        # The ReID head consumes the detector's neck features, falling back
        # to the backbone output shape when the neck provides none.
        if detector.neck:
            feat_shape = detector.neck.out_shape or detector.backbone.out_shape
        else:
            feat_shape = detector.backbone.out_shape
        reid = create(cfg['reid'], input_shape=feat_shape)
        loss = create(cfg['loss'])
        tracker = create(cfg['tracker'])
        return {
            'detector': detector,
            'reid': reid,
            'loss': loss,
            'tracker': tracker
        }

    def _forward(self):
        # det_outs keys --
        #   train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss
        #          (optional: iou_loss)
        #   eval/infer: neck_feat, bbox, bbox_inds
        det_outs = self.detector(self.inputs)
        neck_feat = det_outs['neck_feat']
        if not self.training:
            # (pred_dets, pred_embs)
            return self.reid(neck_feat, self.inputs, det_outs['bbox'],
                             det_outs['bbox_inds'], det_outs['topk_clses'])
        reid_loss = self.reid(neck_feat, self.inputs)
        loss = self.loss(det_outs['det_loss'], reid_loss)
        # Surface every individual detector loss term alongside the total.
        for name, value in det_outs.items():
            if 'loss' in name:
                loss.update({name: value})
        loss.update({'reid_loss': reid_loss})
        return loss

    def get_pred(self):
        return self._forward()

    def get_loss(self):
        return self._forward()

View File

@@ -0,0 +1,167 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import numpy as np
__all__ = ['FasterRCNN']
@register
class FasterRCNN(BaseArch):
    """
    Faster R-CNN network, see https://arxiv.org/abs/1506.01497

    Args:
        backbone (object): backbone instance
        rpn_head (object): `RPNHead` instance
        bbox_head (object): `BBoxHead` instance
        bbox_post_process (object): `BBoxPostProcess` instance
        neck (object): 'FPN' instance
    """
    __category__ = 'architecture'
    __inject__ = ['bbox_post_process']

    def __init__(self,
                 backbone,
                 rpn_head,
                 bbox_head,
                 bbox_post_process,
                 neck=None):
        super(FasterRCNN, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.rpn_head = rpn_head
        self.bbox_head = bbox_head
        self.bbox_post_process = bbox_post_process

    def init_cot_head(self, relationship):
        # Pass the class-relationship matrix down to the bbox head
        # (label-relationship / CoT few-shot training).
        self.bbox_head.init_cot_head(relationship)

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        kwargs = {'input_shape': backbone.out_shape}
        neck = cfg['neck'] and create(cfg['neck'], **kwargs)

        # Heads consume the neck's output shape when a neck exists.
        out_shape = neck and neck.out_shape or backbone.out_shape
        kwargs = {'input_shape': out_shape}
        rpn_head = create(cfg['rpn_head'], **kwargs)
        bbox_head = create(cfg['bbox_head'], **kwargs)
        return {
            'backbone': backbone,
            'neck': neck,
            "rpn_head": rpn_head,
            "bbox_head": bbox_head,
        }

    def _forward(self):
        """Shared forward: returns (rpn_loss, bbox_loss) in training,
        (bbox_pred, bbox_num[, extra_data]) otherwise."""
        body_feats = self.backbone(self.inputs)
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        if self.training:
            rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
            bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num,
                                          self.inputs)
            return rpn_loss, bbox_loss
        else:
            rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
            preds, _ = self.bbox_head(body_feats, rois, rois_num, None)

            im_shape = self.inputs['im_shape']
            scale_factor = self.inputs['scale_factor']
            bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
                preds, (rois, rois_num), im_shape, scale_factor)

            # rescale the prediction back to origin image
            bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
                bbox, bbox_num, im_shape, scale_factor)

            # NOTE(review): self.use_extra_data is not set in this class;
            # presumably initialized by BaseArch -- confirm.
            if self.use_extra_data:
                extra_data = {
                }  # record the bbox output before nms, such like scores and nms_keep_idx
                """extra_data:{
                            'scores': predict scores,
                            'nms_keep_idx': bbox index before nms,
                            }
                """
                extra_data['scores'] = preds[1]  # predict scores (probability)
                # Todo: get logits output
                extra_data[
                    'nms_keep_idx'] = nms_keep_idx  # bbox index before nms
                return bbox_pred, bbox_num, extra_data
            else:
                return bbox_pred, bbox_num

    def get_loss(self, ):
        """Collect RPN + RCNN losses and add their sum under 'loss'."""
        rpn_loss, bbox_loss = self._forward()
        loss = {}
        loss.update(rpn_loss)
        loss.update(bbox_loss)
        total_loss = paddle.add_n(list(loss.values()))
        loss.update({'loss': total_loss})
        return loss

    def get_pred(self):
        """Return a dict with 'bbox'/'bbox_num' (+ 'extra_data' if enabled)."""
        if self.use_extra_data:
            bbox_pred, bbox_num, extra_data = self._forward()
            output = {
                'bbox': bbox_pred,
                'bbox_num': bbox_num,
                'extra_data': extra_data
            }
        else:
            bbox_pred, bbox_num = self._forward()
            output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
        return output

    def target_bbox_forward(self, data):
        """Run the bbox head on ground-truth boxes used as RoIs (CoT mode).

        Returns the head predictions for those GT regions.
        """
        body_feats = self.backbone(data)
        if self.neck is not None:
            body_feats = self.neck(body_feats)
        rois = [roi for roi in data['gt_bbox']]
        rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois])

        preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True)
        return preds

    def relationship_learning(self, loader, num_classes_novel):
        """Estimate P(base class | novel class) from GT-box predictions.

        Iterates the loader, collects per-box base-class probabilities and
        their novel-class labels, and averages the probabilities per label.

        Returns:
            np.ndarray: (num_novel_labels, num_base_classes) conditional
            probability matrix.
        """
        # NOTE(review): num_classes_novel is currently unused; the number of
        # rows is derived from the labels actually seen (np.max + 1).
        print('computing relationship')
        train_labels_list = []
        label_list = []

        for step_id, data in enumerate(loader):
            _, bbox_prob = self.target_bbox_forward(data)
            batch_size = data['im_id'].shape[0]
            for i in range(batch_size):
                num_bbox = data['gt_class'][i].shape[0]  # unused
                train_labels = data['gt_class'][i]
                train_labels_list.append(train_labels.numpy().squeeze(1))
            # Drop the background column (last) from the probabilities.
            base_labels = bbox_prob.detach().numpy()[:, :-1]
            label_list.append(base_labels)

        labels = np.concatenate(train_labels_list, 0)
        probabilities = np.concatenate(label_list, 0)
        N_t = np.max(labels) + 1
        conditional = []
        for i in range(N_t):
            this_class = probabilities[labels == i]
            average = np.mean(this_class, axis=0, keepdims=True)
            conditional.append(average)
        return np.concatenate(conditional)

View File

@@ -0,0 +1,222 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['FCOS', 'ARSL_FCOS']
@register
class FCOS(BaseArch):
    """
    FCOS network, see https://arxiv.org/abs/1904.01355

    Args:
        backbone (object): backbone instance
        neck (object): 'FPN' instance
        fcos_head (object): 'FCOSHead' instance
        ssod_loss (object): 'SSODFCOSLoss' instance, only used for
            semi-det (ssod) by DenseTeacher
    """

    __category__ = 'architecture'
    __inject__ = ['ssod_loss']

    def __init__(self,
                 backbone='ResNet',
                 neck='FPN',
                 fcos_head='FCOSHead',
                 ssod_loss='SSODFCOSLoss'):
        super(FCOS, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.fcos_head = fcos_head
        # Semi-det (ssod) state: refreshed per batch from the inputs.
        self.is_teacher = False
        self.ssod_loss = ssod_loss

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        fcos_head = create(cfg['fcos_head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "fcos_head": fcos_head}

    def _forward(self):
        fpn_feats = self.neck(self.backbone(self.inputs))
        self.is_teacher = self.inputs.get('is_teacher', False)
        if not (self.training or self.is_teacher):
            # Plain inference: decode and rescale detections.
            head_outs = self.fcos_head(fpn_feats)
            bbox_pred, bbox_num = self.fcos_head.post_process(
                head_outs, self.inputs['scale_factor'])
            return {'bbox': bbox_pred, 'bbox_num': bbox_num}
        # Training (or teacher pass in ssod): the head computes losses.
        return self.fcos_head(fpn_feats, self.inputs)

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

    def get_loss_keys(self):
        return ['loss_cls', 'loss_box', 'loss_quality']

    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
        """Dense-teacher distillation loss between student and teacher outputs."""
        return self.ssod_loss(student_head_outs, teacher_head_outs, train_cfg)
@register
class ARSL_FCOS(BaseArch):
    """
    FCOS ARSL network, see https://arxiv.org/abs/

    Args:
        backbone (object): backbone instance
        neck (object): 'FPN' instance
        fcos_head (object): 'FCOSHead_ARSL' instance
        fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det(ssod) by ARSL
    """
    __category__ = 'architecture'
    __inject__ = ['fcos_cr_loss']

    def __init__(self,
                 backbone,
                 neck,
                 fcos_head='FCOSHead_ARSL',
                 fcos_cr_loss='FCOSLossCR'):
        super(ARSL_FCOS, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.fcos_head = fcos_head
        self.fcos_cr_loss = fcos_cr_loss

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)
        kwargs = {'input_shape': neck.out_shape}
        fcos_head = create(cfg['fcos_head'], **kwargs)
        # consistency regularization loss
        fcos_cr_loss = create(cfg['fcos_cr_loss'])
        return {
            'backbone': backbone,
            'neck': neck,
            'fcos_head': fcos_head,
            'fcos_cr_loss': fcos_cr_loss,
        }

    def forward(self, inputs, branch="supervised", teacher_prediction=None):
        """Dispatch to the four ARSL modes.

        Training:  supervised loss / pseudo (consistency) loss.
        Inference: normal prediction / pseudo-label prediction for teacher.
        """
        assert branch in ['supervised', 'semi_supervised'], \
            print('In ARSL, type must be supervised or semi_supervised.')
        # NOTE(review): print(...) returns None, so this assert carries no
        # message; it only prints before failing.
        # data_format presumably comes from BaseArch -- confirm.
        if self.data_format == 'NHWC':
            image = inputs['image']
            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])
        self.inputs = inputs
        if self.training:
            if branch == "supervised":
                out = self.get_loss()
            else:
                out = self.get_pseudo_loss(teacher_prediction)
        else:
            # norm test
            if branch == "supervised":
                out = self.get_pred()
            # predict pseudo labels
            else:
                out = self.get_pseudo_pred()
        return out

    # model forward
    def model_forward(self):
        """backbone -> neck -> head; returns the raw head outputs."""
        body_feats = self.backbone(self.inputs)
        fpn_feats = self.neck(body_feats)
        fcos_head_outs = self.fcos_head(fpn_feats)
        return fcos_head_outs

    # supervised loss for labeled data
    def get_loss(self):
        loss = {}
        # Gather per-FPN-level targets that the data pipeline put into the
        # inputs (missing levels are simply skipped).
        tag_labels, tag_bboxes, tag_centerness = [], [], []
        for i in range(len(self.fcos_head.fpn_stride)):
            # labels, reg_target, centerness
            k_lbl = 'labels{}'.format(i)
            if k_lbl in self.inputs:
                tag_labels.append(self.inputs[k_lbl])
            k_box = 'reg_target{}'.format(i)
            if k_box in self.inputs:
                tag_bboxes.append(self.inputs[k_box])
            k_ctn = 'centerness{}'.format(i)
            if k_ctn in self.inputs:
                tag_centerness.append(self.inputs[k_ctn])
        fcos_head_outs = self.model_forward()
        loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels,
                                            tag_bboxes, tag_centerness)
        loss.update(loss_fcos)
        return loss

    # unsupervised loss for unlabeled data
    def get_pseudo_loss(self, teacher_prediction):
        loss = {}
        fcos_head_outs = self.model_forward()
        unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction)
        # Suffix keys so they do not collide with the supervised terms.
        for k in unsup_loss.keys():
            loss[k + '_pseudo'] = unsup_loss[k]
        return loss

    # get detection results for test, decode and rescale the results to original size
    def get_pred(self):
        fcos_head_outs = self.model_forward()
        scale_factor = self.inputs['scale_factor']
        bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs,
                                                          scale_factor)
        output = {'bbox': bbox_pred, 'bbox_num': bbox_num}
        return output

    # generate pseudo labels to guide student
    def get_pseudo_pred(self):
        fcos_head_outs = self.model_forward()
        pred_cls, pred_loc, pred_iou = fcos_head_outs[1:]  # 0 is locations
        # Normalize regression outputs by the FPN stride of each level.
        for lvl, _ in enumerate(pred_loc):
            pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl]
        return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride]

View File

@@ -0,0 +1,87 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['GFL']
@register
class GFL(BaseArch):
    """
    Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388

    Args:
        backbone (object): backbone instance
        neck (object): 'FPN' instance
        head (object): 'GFLHead' instance
    """

    __category__ = 'architecture'

    def __init__(self, backbone, neck, head='GFLHead'):
        super(GFL, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        # backbone -> neck -> head
        head_outs = self.head(self.neck(self.backbone(self.inputs)))
        if self.training:
            return head_outs
        # Decode and rescale predictions for evaluation / inference.
        bboxes, bbox_num = self.head.post_process(
            head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
        return bboxes, bbox_num

    def get_loss(self, ):
        """Collect head losses and add their sum under 'loss'."""
        head_outs = self._forward()
        loss = dict(self.head.get_loss(head_outs, self.inputs))
        loss['loss'] = paddle.add_n(list(loss.values()))
        return loss

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {'bbox': bbox_pred, 'bbox_num': bbox_num}

View File

@@ -0,0 +1,110 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['JDE']
@register
class JDE(BaseArch):
    __category__ = 'architecture'
    __shared__ = ['metric']
    """
    JDE network, see https://arxiv.org/abs/1909.12605v1

    Args:
        detector (object): detector model instance
        reid (object): reid model instance
        tracker (object): tracker instance
        metric (str): 'MOTDet' for training and detection evaluation, 'ReID'
            for ReID embedding evaluation, or 'MOT' for multi object tracking
            evaluation.
    """

    def __init__(self,
                 detector='YOLOv3',
                 reid='JDEEmbeddingHead',
                 tracker='JDETracker',
                 metric='MOT'):
        super(JDE, self).__init__()
        self.detector = detector
        self.reid = reid
        self.tracker = tracker
        self.metric = metric

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        detector = create(cfg['detector'])
        # The embedding head reads features straight from the detector neck.
        reid = create(cfg['reid'], input_shape=detector.neck.out_shape)
        tracker = create(cfg['tracker'])
        return {"detector": detector, "reid": reid, "tracker": tracker}

    def _forward(self):
        det_outs = self.detector(self.inputs)

        if self.training:
            # Joint detection + embedding losses.
            det_losses = det_outs['det_losses']
            return self.reid(
                det_outs['emb_feats'],
                self.inputs,
                loss_confs=det_losses['loss_confs'],
                loss_boxes=det_losses['loss_boxes'])

        if self.metric == 'MOTDet':
            # Detection-only evaluation.
            return {
                'bbox': det_outs['bbox'],
                'bbox_num': det_outs['bbox_num'],
            }

        if self.metric == 'MOT':
            # Full tracking: detections plus their embeddings.
            pred_dets, pred_embs = self.reid(
                det_outs['emb_feats'],
                self.inputs,
                bboxes=det_outs['bbox'],
                boxes_idx=det_outs['boxes_idx'],
                nms_keep_idx=det_outs['nms_keep_idx'])
            return pred_dets, pred_embs

        raise ValueError("Unknown metric {} for multi object tracking.".
                         format(self.metric))

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,287 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from scipy.optimize import linear_sum_assignment
from collections import abc, defaultdict
import numpy as np
import paddle
from ppdet.core.workspace import register, create, serializable
from .meta_arch import BaseArch
from .. import layers as L
from ..keypoint_utils import transpred
__all__ = ['HigherHRNet']
@register
class HigherHRNet(BaseArch):
    __category__ = 'architecture'

    def __init__(self,
                 backbone='HRNet',
                 hrhrnet_head='HrHRNetHead',
                 post_process='HrHRNetPostProcess',
                 eval_flip=True,
                 flip_perm=None,
                 max_num_people=30):
        """
        HigherHRNet network, see https://arxiv.org/abs/1908.10357
        HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175

        Args:
            backbone (nn.Layer): backbone instance
            hrhrnet_head (nn.Layer): keypoint_head instance
            post_process (object): `HrHRNetPostProcess` instance
            eval_flip (bool): average predictions over a horizontal flip at
                eval time.
            flip_perm (list): left-right joint exchange order used by the
                flip test.
            max_num_people (int): maximum number of people kept per joint
                in the top-k peak selection.
        """
        super(HigherHRNet, self).__init__()
        self.backbone = backbone
        self.hrhrnet_head = hrhrnet_head
        self.post_process = post_process
        self.flip = eval_flip
        # NOTE(review): paddle.to_tensor(None) would fail, so flip_perm is
        # presumably always supplied by config -- confirm.
        self.flip_perm = paddle.to_tensor(flip_perm)
        self.deploy = False
        # x2 upsample and 5x5 max-pool used by get_topk peak finding.
        self.interpolate = L.Upsample(2, mode='bilinear')
        self.pool = L.MaxPool(5, 1, 2)
        self.max_num_people = max_num_people

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # head
        kwargs = {'input_shape': backbone.out_shape}
        hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs)
        post_process = create(cfg['post_process'])

        return {
            'backbone': backbone,
            "hrhrnet_head": hrhrnet_head,
            "post_process": post_process,
        }

    def _forward(self):
        # Flip test: run original + horizontally flipped images as one batch.
        if self.flip and not self.training and not self.deploy:
            self.inputs['image'] = paddle.concat(
                (self.inputs['image'], paddle.flip(self.inputs['image'], [3])))
        body_feats = self.backbone(self.inputs)

        if self.training:
            return self.hrhrnet_head(body_feats, self.inputs)
        else:
            outputs = self.hrhrnet_head(body_feats)

            if self.flip and not self.deploy:
                # Split each output into (original, flipped) halves, un-flip
                # the flipped half (re-permuting left/right joints), then
                # average the heatmaps and keep both tagmaps.
                outputs = [paddle.split(o, 2) for o in outputs]
                output_rflip = [
                    paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3])
                    for o in outputs
                ]
                output1 = [o[0] for o in outputs]
                heatmap = (output1[0] + output_rflip[0]) / 2.
                tagmaps = [output1[1], output_rflip[1]]
                outputs = [heatmap] + tagmaps
            outputs = self.get_topk(outputs)

            if self.deploy:
                return outputs

            res_lst = []
            h = self.inputs['im_shape'][0, 0].numpy().item()
            w = self.inputs['im_shape'][0, 1].numpy().item()
            kpts, scores = self.post_process(*outputs, h, w)
            res_lst.append([kpts, scores])
            return res_lst

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        outputs = {}
        res_lst = self._forward()
        outputs['keypoint'] = res_lst
        return outputs

    def get_topk(self, outputs):
        """Upsample outputs and select per-joint top-k heatmap peaks.

        Returns [heatmap, tagmap, heat_k, inds_k] for the post-processor.
        """
        # resize to image size
        outputs = [self.interpolate(x) for x in outputs]
        # Two tagmaps when flip-testing (original + flipped), else one.
        if len(outputs) == 3:
            tagmap = paddle.concat(
                (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4)
        else:
            tagmap = outputs[1].unsqueeze(4)

        heatmap = outputs[0]
        N, J = 1, self.hrhrnet_head.num_joints
        heatmap_maxpool = self.pool(heatmap)
        # topk: keep only local maxima (where max-pool equals the value).
        maxmap = heatmap * (heatmap == heatmap_maxpool)
        maxmap = maxmap.reshape([N, J, -1])
        heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2)

        outputs = [heatmap, tagmap, heat_k, inds_k]
        return outputs
@register
@serializable
class HrHRNetPostProcess(object):
    '''
    HrHRNet postprocess contain:
        1) get topk keypoints in the output heatmap
        2) sample the tagmap's value corresponding to each of the topk coordinate
        3) match different joints to combine to some people with Hungary algorithm
        4) adjust the coordinate by +-0.25 to decrease error std
        5) salvage missing joints by check positivity of heatmap - tagdiff_norm

    Args:
        max_num_people (int): max number of people support in postprocess
        heat_thresh (float): value of topk below this threshhold will be ignored
        tag_thresh (float): coord's value sampled in tagmap below this threshold belong to same people for init
        inputs(list[heatmap]): the output list of model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk
        original_height, original_width (float): the original image size
    '''

    def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.):
        self.max_num_people = max_num_people
        self.heat_thresh = heat_thresh
        self.tag_thresh = tag_thresh

    def lerp(self, j, y, x, heatmap):
        """Quarter-pixel refinement: shift each peak +-0.25 toward the
        larger neighboring heatmap value (plus a 0.5 pixel-center offset).

        Args:
            j: joint indices, y/x: integer peak coordinates (arrays).
            heatmap: (J, H, W) numpy heatmap.

        Returns:
            (offset_y, offset_x) arrays to add to y/x.
        """
        H, W = heatmap.shape[-2:]
        # Clamp neighbor coordinates at the borders.
        left = np.clip(x - 1, 0, W - 1)
        right = np.clip(x + 1, 0, W - 1)
        up = np.clip(y - 1, 0, H - 1)
        down = np.clip(y + 1, 0, H - 1)
        offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25,
                            -0.25)
        offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25,
                            -0.25)
        return offset_y + 0.5, offset_x + 0.5

    def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height,
                 original_width):
        """Group top-k joint peaks into people and refine their coordinates.

        Returns:
            (pose_kpts, mean_score): (K, J, 3) keypoints as (x, y, score) in
            original-image coordinates, and per-person mean scores (computed
            before joint salvage).
        """
        N, J, H, W = heatmap.shape
        assert N == 1, "only support batch size 1"
        heatmap = heatmap[0].cpu().detach().numpy()
        tagmap = tagmap[0].cpu().detach().numpy()
        heats = heat_k[0].cpu().detach().numpy()
        inds_np = inds_k[0].cpu().detach().numpy()
        # Flat top-k indices -> 2D peak coordinates.
        y = inds_np // W
        x = inds_np % W
        # Sample the tag embedding of every peak: (J, max_num_people, tag_dim).
        tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people),
                      y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1])
        coords = np.stack((y, x), axis=2)
        # threshold
        mask = heats > self.heat_thresh
        # cluster: one entry per person, keyed by the first tag value seen.
        cluster = defaultdict(lambda: {
            'coords': np.zeros((J, 2), dtype=np.float32),
            'scores': np.zeros(J, dtype=np.float32),
            'tags': []
        })
        for jid, m in enumerate(mask):
            num_valid = m.sum()
            if num_valid == 0:
                continue
            valid_inds = np.where(m)[0]
            valid_tags = tags[jid, m, :]
            if len(cluster) == 0:  # initialize
                for i in valid_inds:
                    tag = tags[jid, i]
                    key = tag[0]
                    cluster[key]['tags'].append(tag)
                    cluster[key]['scores'][jid] = heats[jid, i]
                    cluster[key]['coords'][jid] = coords[jid, i]
                continue
            candidates = list(cluster.keys())[:self.max_num_people]
            centroids = [
                np.mean(
                    cluster[k]['tags'], axis=0) for k in candidates
            ]
            num_clusters = len(centroids)
            # shape is (num_valid, num_clusters, tag_dim)
            dist = valid_tags[:, None, :] - np.array(centroids)[None, ...]
            l2_dist = np.linalg.norm(dist, ord=2, axis=2)
            # modulate dist with heat value, see `use_detection_val`
            cost = np.round(l2_dist) * 100 - heats[jid, m, None]
            # pad the cost matrix, otherwise new pose are ignored
            if num_valid > num_clusters:
                cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)),
                              'constant',
                              constant_values=((0, 0), (0, 1e-10)))
            # Hungarian assignment of peaks to existing clusters.
            rows, cols = linear_sum_assignment(cost)
            for y, x in zip(rows, cols):
                tag = tags[jid, y]
                if y < num_valid and x < num_clusters and \
                   l2_dist[y, x] < self.tag_thresh:
                    key = candidates[x]  # merge to cluster
                else:
                    key = tag[0]  # initialize new cluster
                cluster[key]['tags'].append(tag)
                cluster[key]['scores'][jid] = heats[jid, y]
                cluster[key]['coords'][jid] = coords[jid, y]

        # shape is [k, J, 2] and [k, J]
        pose_tags = np.array([cluster[k]['tags'] for k in cluster])
        pose_coords = np.array([cluster[k]['coords'] for k in cluster])
        pose_scores = np.array([cluster[k]['scores'] for k in cluster])
        valid = pose_scores > 0

        pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32)
        if valid.sum() == 0:
            return pose_kpts, pose_kpts

        # refine coords with the quarter-pixel offsets
        valid_coords = pose_coords[valid].astype(np.int32)
        y = valid_coords[..., 0].flatten()
        x = valid_coords[..., 1].flatten()
        _, j = np.nonzero(valid)
        offsets = self.lerp(j, y, x, heatmap)
        pose_coords[valid, 0] += offsets[0]
        pose_coords[valid, 1] += offsets[1]

        # mean score before salvage
        mean_score = pose_scores.mean(axis=1)
        pose_kpts[valid, 2] = pose_scores[valid]

        # salvage missing joints: for each person, look for joints whose
        # (heatmap - rounded tag distance) is still positive anywhere.
        if True:
            for pid, coords in enumerate(pose_coords):
                tag_mean = np.array(pose_tags[pid]).mean(axis=0)
                norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5
                score = heatmap - np.round(norm)  # (J, H, W)
                flat_score = score.reshape(J, -1)
                max_inds = np.argmax(flat_score, axis=1)
                max_scores = np.max(flat_score, axis=1)
                salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0)
                if salvage_joints.sum() == 0:
                    continue
                y = max_inds[salvage_joints] // W
                x = max_inds[salvage_joints] % W
                offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap)
                y = y.astype(np.float32) + offsets[0]
                x = x.astype(np.float32) + offsets[1]
                pose_coords[pid][salvage_joints, 0] = y
                pose_coords[pid][salvage_joints, 1] = x
                pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints]
        # Map (y, x) heatmap coordinates back to the original image as (x, y).
        pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1],
                                       original_height, original_width,
                                       min(H, W))
        return pose_kpts, mean_score

View File

@@ -0,0 +1,468 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import numpy as np
import math
import cv2
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..keypoint_utils import transform_preds
from .. import layers as L
from paddle.nn import functional as F
__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet']
@register
class TopDownHRNet(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(self,
                 width,
                 num_joints,
                 backbone='HRNet',
                 loss='KeyPointMSELoss',
                 post_process='HRNetPostProcess',
                 flip_perm=None,
                 flip=True,
                 shift_heatmap=True,
                 use_dark=True):
        """
        HRNet network, see https://arxiv.org/abs/1902.09212

        Args:
            width (int): number of channels produced by the backbone branch
                fed into the final 1x1 conv.
            num_joints (int): number of keypoint heatmaps to predict.
            backbone (nn.Layer): backbone instance
            loss (object): keypoint loss instance (injected by config).
            post_process (object): `HRNetPostProcess` instance
            flip_perm (list): The left-right joints exchange order list
            flip (bool): whether to average predictions with a horizontally
                flipped forward pass at test time.
            shift_heatmap (bool): shift the flipped heatmap one pixel right
                before averaging (standard flip-test alignment trick).
            use_dark(bool): Whether to use DARK in post processing
        """
        super(TopDownHRNet, self).__init__()
        self.backbone = backbone
        self.post_process = HRNetPostProcess(use_dark)
        self.loss = loss
        self.flip_perm = flip_perm
        self.flip = flip
        # 1x1 conv mapping backbone features to one heatmap per joint
        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
        self.shift_heatmap = shift_heatmap
        self.deploy = False

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        return {'backbone': backbone, }

    def _forward(self):
        """Shared forward path for train / deploy / eval modes.

        Returns the loss dict when training, raw heatmaps plus argmax
        indices when deploying, and post-processed keypoints otherwise.
        """
        feats = self.backbone(self.inputs)
        hrnet_outputs = self.final_conv(feats[0])
        if self.training:
            return self.loss(hrnet_outputs, self.inputs)
        elif self.deploy:
            # deploy mode: return flattened per-joint argmax so the exported
            # model needs no numpy post-processing
            outshape = hrnet_outputs.shape
            max_idx = paddle.argmax(
                hrnet_outputs.reshape(
                    (outshape[0], outshape[1], outshape[2] * outshape[3])),
                axis=-1)
            return hrnet_outputs, max_idx
        else:
            if self.flip:
                # flip test: run the mirrored image, un-flip the heatmaps,
                # then average with the original prediction
                self.inputs['image'] = self.inputs['image'].flip([3])
                feats = self.backbone(self.inputs)
                output_flipped = self.final_conv(feats[0])
                output_flipped = self.flip_back(output_flipped.numpy(),
                                                self.flip_perm)
                output_flipped = paddle.to_tensor(output_flipped.copy())
                if self.shift_heatmap:
                    # shift one pixel to compensate flip quantization
                    output_flipped[:, :, :, 1:] = output_flipped.clone(
                    )[:, :, :, 0:-1]
                hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5
            # fall back to image-shape-derived center/scale when the batch
            # does not carry explicit crop metadata
            imshape = (self.inputs['im_shape'].numpy()
                       )[:, ::-1] if 'im_shape' in self.inputs else None
            center = self.inputs['center'].numpy(
            ) if 'center' in self.inputs else np.round(imshape / 2.)
            scale = self.inputs['scale'].numpy(
            ) if 'scale' in self.inputs else imshape / 200.
            outputs = self.post_process(hrnet_outputs, center, scale)
            return outputs

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        res_lst = self._forward()
        outputs = {'keypoint': res_lst}
        return outputs

    def flip_back(self, output_flipped, matched_parts):
        """Undo a horizontal flip on heatmaps: mirror the width axis and
        swap every left/right joint pair listed in *matched_parts*."""
        assert output_flipped.ndim == 4,\
            'output_flipped should be [batch_size, num_joints, height, width]'
        output_flipped = output_flipped[:, :, :, ::-1]
        for pair in matched_parts:
            tmp = output_flipped[:, pair[0], :, :].copy()
            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
            output_flipped[:, pair[1], :, :] = tmp
        return output_flipped
class HRNetPostProcess(object):
    """Decode HRNet heatmaps into image-space keypoint coordinates,
    optionally refined with DARK (Distribution-Aware coordinate
    Representation of Keypoints)."""

    def __init__(self, use_dark=True):
        # use_dark: refine argmax coordinates via Taylor expansion of the
        # log-heatmap instead of the classic quarter-offset heuristic
        self.use_dark = use_dark

    def get_max_preds(self, heatmaps):
        '''get predictions from score maps
        Args:
            heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
        Returns:
            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
            maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
        '''
        assert isinstance(heatmaps,
                          np.ndarray), 'heatmaps should be numpy.ndarray'
        assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
        batch_size = heatmaps.shape[0]
        num_joints = heatmaps.shape[1]
        width = heatmaps.shape[3]
        heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))
        idx = np.argmax(heatmaps_reshaped, 2)
        maxvals = np.amax(heatmaps_reshaped, 2)
        maxvals = maxvals.reshape((batch_size, num_joints, 1))
        idx = idx.reshape((batch_size, num_joints, 1))
        # convert flat argmax index to (x, y)
        preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
        preds[:, :, 0] = (preds[:, :, 0]) % width
        preds[:, :, 1] = np.floor((preds[:, :, 1]) / width)
        # zero out joints whose best response is non-positive
        pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
        pred_mask = pred_mask.astype(np.float32)
        preds *= pred_mask
        return preds, maxvals

    def gaussian_blur(self, heatmap, kernel):
        """Blur every joint heatmap with an edge-padded Gaussian, then
        rescale so each map keeps its original peak value (required by the
        DARK Taylor-expansion step)."""
        border = (kernel - 1) // 2
        batch_size = heatmap.shape[0]
        num_joints = heatmap.shape[1]
        height = heatmap.shape[2]
        width = heatmap.shape[3]
        for i in range(batch_size):
            for j in range(num_joints):
                origin_max = np.max(heatmap[i, j])
                dr = np.zeros((height + 2 * border, width + 2 * border))
                dr[border:-border, border:-border] = heatmap[i, j].copy()
                dr = cv2.GaussianBlur(dr, (kernel, kernel), 0)
                heatmap[i, j] = dr[border:-border, border:-border].copy()
                heatmap[i, j] *= origin_max / np.max(heatmap[i, j])
        return heatmap

    def dark_parse(self, hm, coord):
        """Refine one coordinate by a second-order Taylor expansion of the
        (log-)heatmap *hm* around the integer peak; no-op near borders or
        when the Hessian is singular."""
        heatmap_height = hm.shape[0]
        heatmap_width = hm.shape[1]
        px = int(coord[0])
        py = int(coord[1])
        if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2:
            # central-difference gradient and Hessian at the peak
            dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1])
            dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px])
            dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2])
            dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \
                + hm[py-1][px-1])
            dyy = 0.25 * (
                hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px])
            derivative = np.matrix([[dx], [dy]])
            hessian = np.matrix([[dxx, dxy], [dxy, dyy]])
            if dxx * dyy - dxy**2 != 0:
                # Newton step: offset = -H^-1 * grad
                hessianinv = hessian.I
                offset = -hessianinv * derivative
                offset = np.squeeze(np.array(offset.T), axis=0)
                coord += offset
        return coord

    def dark_postprocess(self, hm, coords, kernelsize):
        '''DARK postpocessing, Zhang et al. Distribution-Aware Coordinate
        Representation for Human Pose Estimation (CVPR 2020).
        '''
        hm = self.gaussian_blur(hm, kernelsize)
        # work in log space; clamp first so log is defined everywhere
        hm = np.maximum(hm, 1e-10)
        hm = np.log(hm)
        for n in range(coords.shape[0]):
            for p in range(coords.shape[1]):
                coords[n, p] = self.dark_parse(hm[n][p], coords[n][p])
        return coords

    def get_final_preds(self, heatmaps, center, scale, kernelsize=3):
        """the highest heatvalue location with a quarter offset in the
        direction from the highest response to the second highest response.
        Args:
            heatmaps (numpy.ndarray): The predicted heatmaps
            center (numpy.ndarray): The boxes center
            scale (numpy.ndarray): The scale factor
        Returns:
            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
        """
        coords, maxvals = self.get_max_preds(heatmaps)
        heatmap_height = heatmaps.shape[2]
        heatmap_width = heatmaps.shape[3]
        if self.use_dark:
            coords = self.dark_postprocess(heatmaps, coords, kernelsize)
        else:
            # classic refinement: shift a quarter pixel toward the larger
            # neighbouring response
            for n in range(coords.shape[0]):
                for p in range(coords.shape[1]):
                    hm = heatmaps[n][p]
                    px = int(math.floor(coords[n][p][0] + 0.5))
                    py = int(math.floor(coords[n][p][1] + 0.5))
                    if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1:
                        diff = np.array([
                            hm[py][px + 1] - hm[py][px - 1],
                            hm[py + 1][px] - hm[py - 1][px]
                        ])
                        coords[n][p] += np.sign(diff) * .25
        preds = coords.copy()
        # Transform back
        for i in range(coords.shape[0]):
            preds[i] = transform_preds(coords[i], center[i], scale[i],
                                       [heatmap_width, heatmap_height])
        return preds, maxvals

    def __call__(self, output, center, scale):
        preds, maxvals = self.get_final_preds(output.numpy(), center, scale)
        # pack as [[kpts_with_scores, mean_score_per_instance]]
        outputs = [[
            np.concatenate(
                (preds, maxvals), axis=-1), np.mean(
                    maxvals, axis=1)
        ]]
        return outputs
class TinyPose3DPostProcess(object):
    """Rescale TinyPose3D keypoint predictions back to input-image scale."""

    def __init__(self):
        pass

    def __call__(self, output, center, scale):
        """
        Args:
            output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
            scale (numpy.ndarray): The scale factor
        Returns:
            preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords
        """
        preds = output.numpy().copy()
        batch_size = output.shape[0]
        # multiply x/y by the per-sample scale; z is left untouched
        for idx in range(batch_size):
            sample = preds[idx]
            sample[:, 0] = sample[:, 0] * scale[idx][0]
            sample[:, 1] = sample[:, 1] * scale[idx][1]
        return preds
def soft_argmax(heatmaps, joint_num):
    """Differentiable soft-argmax over packed 3D heatmaps.

    Args:
        heatmaps (Tensor): [batch_size, joint_num * depth_dim, H, W] raw
            scores; each joint owns `depth_dim` consecutive channels.
        joint_num (int): number of keypoints packed along the channel axis.

    Returns:
        Tensor: [batch_size, joint_num, 3] expected (x, y, z) coordinates,
        each in [0, axis_size - 1].
    """
    dims = heatmaps.shape
    depth_dim = (int)(dims[1] / joint_num)
    height, width = dims[2], dims[3]
    # softmax over each joint's full (depth, height, width) volume
    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * height * width))
    heatmaps = F.softmax(heatmaps, 2)
    heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, height, width))
    # marginal distributions along each axis
    accu_x = heatmaps.sum(axis=(2, 3))  # [N, J, W]
    accu_y = heatmaps.sum(axis=(2, 4))  # [N, J, H]
    accu_z = heatmaps.sum(axis=(3, 4))  # [N, J, D]
    # expectation under each marginal; use the real axis sizes instead of a
    # hard-coded arange(1, 33), which was only valid for 32x32x32 volumes
    accu_x = accu_x * paddle.arange(1, width + 1)
    accu_y = accu_y * paddle.arange(1, height + 1)
    accu_z = accu_z * paddle.arange(1, depth_dim + 1)
    # shift from 1-based weighting back to 0-based coordinates
    accu_x = accu_x.sum(axis=2, keepdim=True) - 1
    accu_y = accu_y.sum(axis=2, keepdim=True) - 1
    accu_z = accu_z.sum(axis=2, keepdim=True) - 1
    coord_out = paddle.concat(
        (accu_x, accu_y, accu_z), axis=2)  # [batch_size, joint_num, 3]
    return coord_out
@register
class TinyPose3DHRHeatmapNet(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(
            self,
            width,  # 40: number of channels output by the backbone
            num_joints,
            backbone='HRNet',
            loss='KeyPointRegressionMSELoss',
            post_process=TinyPose3DPostProcess):
        """
        TinyPose3D variant that regresses packed 3D heatmaps and decodes
        them with soft-argmax.

        Args:
            width (int): channel count of the backbone's output feature map.
            num_joints (int): number of keypoints.
            backbone (nn.Layer): backbone instance
            loss (object): 3D keypoint regression loss (injected by config).
            post_process (object): post process instance
        """
        super(TinyPose3DHRHeatmapNet, self).__init__()
        self.backbone = backbone
        self.post_process = TinyPose3DPostProcess()
        self.loss = loss
        self.deploy = False
        self.num_joints = num_joints
        # each joint owns a 32-bin depth volume packed along the channel axis
        self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True)

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        return {'backbone': backbone, }

    def _forward(self):
        feats = self.backbone(self.inputs)  # feats: [[batch_size, 40, 32, 24]]
        hrnet_outputs = self.final_conv(feats[0])
        # soft-argmax decodes packed volumes to [batch, joints, 3] coords
        res = soft_argmax(hrnet_outputs, self.num_joints)
        return res

    def get_loss(self):
        pose3d = self._forward()
        loss = self.loss(pose3d, None, self.inputs)
        outputs = {'loss': loss}
        return outputs

    def get_pred(self):
        res_lst = self._forward()
        outputs = {'pose3d': res_lst}
        return outputs

    def flip_back(self, output_flipped, matched_parts):
        """Undo a horizontal flip: mirror the width axis and swap the
        left/right joint pairs in *matched_parts*."""
        assert output_flipped.ndim == 4,\
            'output_flipped should be [batch_size, num_joints, height, width]'
        output_flipped = output_flipped[:, :, :, ::-1]
        for pair in matched_parts:
            tmp = output_flipped[:, pair[0], :, :].copy()
            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
            output_flipped[:, pair[1], :, :] = tmp
        return output_flipped
@register
class TinyPose3DHRNet(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(self,
                 width,
                 num_joints,
                 fc_channel=768,
                 backbone='HRNet',
                 loss='KeyPointRegressionMSELoss',
                 post_process=TinyPose3DPostProcess):
        """
        TinyPose3D variant that regresses (x, y, z) per joint with a small
        MLP head on top of flattened per-joint feature maps.

        Args:
            width (int): channel count of the backbone's output feature map.
            num_joints (int): number of keypoints.
            fc_channel (int): flattened spatial size feeding the first FC
                layer (H/4 * W/4 of the backbone feature map).
            backbone (nn.Layer): backbone instance
            loss (object): 3D keypoint regression loss (injected by config).
            post_process (object): post process instance
        """
        super(TinyPose3DHRNet, self).__init__()
        self.backbone = backbone
        self.post_process = TinyPose3DPostProcess()
        self.loss = loss
        self.deploy = False
        self.num_joints = num_joints
        self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True)
        # flatten spatial dims, then regress 3 coords per joint via MLP
        self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3)
        self.fc1 = paddle.nn.Linear(fc_channel, 256)
        self.act1 = paddle.nn.ReLU()
        self.fc2 = paddle.nn.Linear(256, 64)
        self.act2 = paddle.nn.ReLU()
        self.fc3 = paddle.nn.Linear(64, 3)

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        return {'backbone': backbone, }

    def _forward(self):
        '''
        self.inputs is a dict
        '''
        feats = self.backbone(
            self.inputs)  # feats: [[batch_size, 40, width/4, height/4]]
        hrnet_outputs = self.final_conv(
            feats[0])  # hrnet_outputs: [batch_size, num_joints*32,32,32]
        flatten_res = self.flatten(
            hrnet_outputs)  # [batch_size,num_joints*32,32*32]
        res = self.fc1(flatten_res)
        res = self.act1(res)
        res = self.fc2(res)
        res = self.act2(res)
        res = self.fc3(res)
        if self.training:
            return self.loss(res, self.inputs)
        else:  # export model need
            return res

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        res_lst = self._forward()
        outputs = {'pose3d': res_lst}
        return outputs

    def flip_back(self, output_flipped, matched_parts):
        """Undo a horizontal flip: mirror the width axis and swap the
        left/right joint pairs in *matched_parts*."""
        assert output_flipped.ndim == 4,\
            'output_flipped should be [batch_size, num_joints, height, width]'
        output_flipped = output_flipped[:, :, :, ::-1]
        for pair in matched_parts:
            tmp = output_flipped[:, pair[0], :, :].copy()
            output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :]
            output_flipped[:, pair[1], :, :] = tmp
        return output_flipped

View File

@@ -0,0 +1,217 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register
from .meta_arch import BaseArch
from .. import layers as L
__all__ = ['PETR']
@register
class PETR(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['backbone', 'neck', 'bbox_head']

    def __init__(self,
                 backbone='ResNet',
                 neck='ChannelMapper',
                 bbox_head='PETRHead'):
        """
        PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf
        Args:
            backbone (nn.Layer): backbone instance
            neck (nn.Layer): neck between backbone and head
            bbox_head (nn.Layer): model output and loss
        """
        super(PETR, self).__init__()
        self.backbone = backbone
        if neck is not None:
            self.with_neck = True
            self.neck = neck
        self.bbox_head = bbox_head
        self.deploy = False

    def extract_feat(self, img):
        """Directly extract features from the backbone+neck."""
        x = self.backbone(img)
        if self.with_neck:
            x = self.neck(x)
        return x

    def get_inputs(self):
        """Split padded batch tensors into per-image lists of GT boxes,
        labels, keypoints and areas, plus mmdet-style img_metas dicts."""
        img_metas = []
        gt_bboxes = []
        gt_labels = []
        gt_keypoints = []
        gt_areas = []
        pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1)
        for idx, im_shape in enumerate(self.inputs['im_shape']):
            img_meta = {
                'img_shape': im_shape.astype("int32").tolist() + [1, ],
                'batch_input_shape': self.inputs['image'].shape[-2:],
                'image_name': self.inputs['image_file'][idx]
            }
            img_metas.append(img_meta)
            if (not pad_gt_mask[idx].any()):
                # image with no valid GT: keep one (padded) entry so
                # downstream code always sees non-empty tensors
                gt_keypoints.append(self.inputs['gt_joints'][idx][:1])
                gt_labels.append(self.inputs['gt_class'][idx][:1])
                gt_bboxes.append(self.inputs['gt_bbox'][idx][:1])
                gt_areas.append(self.inputs['gt_areas'][idx][:1])
                continue
            gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]])
            gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]])
            gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]])
            gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]])
        return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas

    def get_loss(self):
        """
        Args:
            img (Tensor): Input images of shape (N, C, H, W).
                Typically these should be mean centered and std scaled.
            img_metas (list[dict]): A List of image info dict where each dict
                has: 'img_shape', 'scale_factor', 'flip', and may also contain
                'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
                For details on the values of these keys see
                :class:`mmdet.datasets.pipelines.Collect`.
            gt_bboxes (list[Tensor]): Each item are the truth boxes for each
                image in [tl_x, tl_y, br_x, br_y] format.
            gt_labels (list[Tensor]): Class indices corresponding to each box.
            gt_keypoints (list[Tensor]): Each item are the truth keypoints for
                each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,
                p^{K}_y, p^{K}_v] format.
            gt_areas (list[Tensor]): mask areas corresponding to each box.
            gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
                boxes can be ignored when computing the loss.
        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs(
        )
        # self.inputs is a dict, so use .get (getattr on a dict always
        # returned the default, silently dropping any ignore boxes)
        gt_bboxes_ignore = self.inputs.get('gt_bboxes_ignore', None)
        x = self.extract_feat(self.inputs)
        losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes,
                                              gt_labels, gt_keypoints, gt_areas,
                                              gt_bboxes_ignore)
        # sum all components into a single 'loss' entry
        loss = 0
        for k, v in losses.items():
            loss += v
        losses['loss'] = loss
        return losses

    def get_pred_numpy(self):
        """Used for computing network flops.
        """
        img = self.inputs['image']
        batch_size, _, height, width = img.shape
        dummy_img_metas = [
            dict(
                batch_input_shape=(height, width),
                img_shape=(height, width, 3),
                scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size)
        ]
        x = self.extract_feat(img)
        outs = self.bbox_head(x, img_metas=dummy_img_metas)
        bbox_list = self.bbox_head.get_bboxes(
            *outs, dummy_img_metas, rescale=True)
        return bbox_list

    def get_pred(self):
        """Run inference and pack [keypoints, scores] per image under the
        'keypoint' key, copying each box score into its keypoints' third
        column."""
        img = self.inputs['image']
        batch_size, _, height, width = img.shape
        img_metas = [
            dict(
                batch_input_shape=(height, width),
                img_shape=(height, width, 3),
                scale_factor=self.inputs['scale_factor'][i])
            for i in range(batch_size)
        ]
        kptpred = self.simple_test(
            self.inputs, img_metas=img_metas, rescale=True)
        keypoints = kptpred[0][1][0]
        bboxs = kptpred[0][0][0]
        # replace per-joint visibility with the instance's box score
        keypoints[..., 2] = bboxs[:, None, 4]
        res_lst = [[keypoints, bboxs[:, 4]]]
        outputs = {'keypoint': res_lst}
        return outputs

    def simple_test(self, inputs, img_metas, rescale=False):
        """Test function without test time augmentation.
        Args:
            inputs (list[paddle.Tensor]): List of multiple images.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.
        Returns:
            list[list[np.ndarray]]: BBox and keypoint results of each image
                and classes. The outer list corresponds to each image.
                The inner list corresponds to each class.
        """
        batch_size = len(img_metas)
        assert batch_size == 1, 'Currently only batch_size 1 for inference ' \
            f'mode is supported. Found batch_size {batch_size}.'
        feat = self.extract_feat(inputs)
        results_list = self.bbox_head.simple_test(
            feat, img_metas, rescale=rescale)
        bbox_kpt_results = [
            self.bbox_kpt2result(det_bboxes, det_labels, det_kpts,
                                 self.bbox_head.num_classes)
            for det_bboxes, det_labels, det_kpts in results_list
        ]
        return bbox_kpt_results

    def bbox_kpt2result(self, bboxes, labels, kpts, num_classes):
        """Convert detection results to a list of numpy arrays.
        Args:
            bboxes (paddle.Tensor | np.ndarray): shape (n, 5).
            labels (paddle.Tensor | np.ndarray): shape (n, ).
            kpts (paddle.Tensor | np.ndarray): shape (n, K, 3).
            num_classes (int): class number, including background class.
        Returns:
            list(ndarray): bbox and keypoint results of each class.
        """
        # local import: this module never imported numpy at top level, so the
        # empty-detection branch previously raised NameError on `np`
        import numpy as np
        if bboxes.shape[0] == 0:
            # `.size(1)` was a torch-ism; neither paddle tensors nor numpy
            # arrays support it — use .shape[1]
            return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \
                [np.zeros((0, kpts.shape[1], 3), dtype=np.float32)
                 for i in range(num_classes)]
        else:
            if isinstance(bboxes, paddle.Tensor):
                bboxes = bboxes.numpy()
                labels = labels.numpy()
                kpts = kpts.numpy()
            return [bboxes[labels == i, :] for i in range(num_classes)], \
                [kpts[labels == i, :, :] for i in range(num_classes)]

View File

@@ -0,0 +1,317 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import numpy as np
import math
import cv2
from ppdet.core.workspace import register, create, serializable
from .meta_arch import BaseArch
from ..keypoint_utils import transform_preds
from .. import layers as L
__all__ = ['VitPose_TopDown', 'VitPosePostProcess']
@register
class VitPose_TopDown(BaseArch):
    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(self, backbone, head, loss, post_process, flip_test):
        """
        VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf

        Args:
            backbone (nn.Layer): backbone instance
            head (nn.Layer): keypoint heatmap head.
            loss (object): keypoint loss (injected by config).
            post_process (object): `VitPosePostProcess` instance
            flip_test (bool): average with a flipped forward pass at test
                time.
        """
        super(VitPose_TopDown, self).__init__()
        self.backbone = backbone
        self.head = head
        self.loss = loss
        self.post_process = post_process
        self.flip_test = flip_test

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # head
        head = create(cfg['head'])
        # post_process
        post_process = create(cfg['post_process'])
        return {
            'backbone': backbone,
            'head': head,
            'post_process': post_process
        }

    def _forward_train(self):
        feats = self.backbone.forward_features(self.inputs['image'])
        vitpost_output = self.head(feats)
        return self.loss(vitpost_output, self.inputs)

    def _forward_test(self):
        feats = self.backbone.forward_features(self.inputs['image'])
        output_heatmap = self.head(feats)
        if self.flip_test:
            # average the heatmap with the prediction on the mirrored image
            img_flipped = self.inputs['image'].flip(3)
            features_flipped = self.backbone.forward_features(img_flipped)
            output_flipped_heatmap = self.head.inference_model(features_flipped,
                                                               self.flip_test)
            output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5
        # fall back to image-shape-derived center/scale when crop metadata
        # is absent
        imshape = (self.inputs['im_shape'].numpy()
                   )[:, ::-1] if 'im_shape' in self.inputs else None
        center = self.inputs['center'].numpy(
        ) if 'center' in self.inputs else np.round(imshape / 2.)
        scale = self.inputs['scale'].numpy(
        ) if 'scale' in self.inputs else imshape / 200.
        result = self.post_process(output_heatmap.cpu().numpy(), center, scale)
        return result

    def get_loss(self):
        return self._forward_train()

    def get_pred(self):
        res_lst = self._forward_test()
        outputs = {'keypoint': res_lst}
        return outputs
@register
@serializable
class VitPosePostProcess(object):
    """Decode ViTPose heatmaps to image-space keypoints, optionally with
    UDP/DARK refinement (use_dark=True)."""

    def __init__(self, use_dark=False):
        self.use_dark = use_dark

    def get_max_preds(self, heatmaps):
        '''get predictions from score maps
        Args:
            heatmaps: numpy.ndarray([batch_size, num_joints, height, width])
        Returns:
            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
            maxvals: numpy.ndarray([batch_size, num_joints, 2]), the maximum confidence of the keypoints
        '''
        assert isinstance(heatmaps,
                          np.ndarray), 'heatmaps should be numpy.ndarray'
        assert heatmaps.ndim == 4, 'batch_images should be 4-ndim'
        batch_size = heatmaps.shape[0]
        num_joints = heatmaps.shape[1]
        width = heatmaps.shape[3]
        heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1))
        idx = np.argmax(heatmaps_reshaped, 2)
        maxvals = np.amax(heatmaps_reshaped, 2)
        maxvals = maxvals.reshape((batch_size, num_joints, 1))
        idx = idx.reshape((batch_size, num_joints, 1))
        # convert flat argmax index to (x, y); floor of // is redundant but
        # harmless
        preds = np.tile(idx, (1, 1, 2)).astype(np.float32)
        preds[:, :, 0] = (preds[:, :, 0]) % width
        preds[:, :, 1] = np.floor((preds[:, :, 1]) // width)
        # zero out joints whose best response is non-positive
        pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2))
        pred_mask = pred_mask.astype(np.float32)
        preds *= pred_mask
        return preds, maxvals

    def post_datk_udp(self, coords, batch_heatmaps, kernel=3):
        """DARK post-pocessing. Implemented by udp. Paper ref: Huang et al. The
        Devil is in the Details: Delving into Unbiased Data Processing for Human
        Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate
        Representation for Human Pose Estimation (CVPR 2020).
        Note:
            - batch size: B
            - num keypoints: K
            - num persons: N
            - height of heatmaps: H
            - width of heatmaps: W
            B=1 for bottom_up paradigm where all persons share the same heatmap.
            B=N for top_down paradigm where each person has its own heatmaps.
        Args:
            coords (np.ndarray[N, K, 2]): Initial coordinates of human pose.
            batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps
            kernel (int): Gaussian kernel size (K) for modulation.
        Returns:
            np.ndarray([N, K, 2]): Refined coordinates.
        """
        if not isinstance(batch_heatmaps, np.ndarray):
            batch_heatmaps = batch_heatmaps.cpu().numpy()
        B, K, H, W = batch_heatmaps.shape
        N = coords.shape[0]
        assert (B == 1 or B == N)
        # in-place Gaussian smoothing, then clamp and move to log space
        for heatmaps in batch_heatmaps:
            for heatmap in heatmaps:
                cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap)
        np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps)
        np.log(batch_heatmaps, batch_heatmaps)
        # edge-pad one pixel so neighbour lookups never run off the map
        batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1),
                                                     (1, 1)),
                                    mode='edge').flatten()
        # flat indices of each coordinate in the padded, flattened volume
        index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2)
        index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K)
        index = index.astype(int).reshape(-1, 1)
        # sample the peak and its 4-neighbourhood for finite differences
        i_ = batch_heatmaps_pad[index]
        ix1 = batch_heatmaps_pad[index + 1]
        iy1 = batch_heatmaps_pad[index + W + 2]
        ix1y1 = batch_heatmaps_pad[index + W + 3]
        ix1_y1_ = batch_heatmaps_pad[index - W - 3]
        ix1_ = batch_heatmaps_pad[index - 1]
        iy1_ = batch_heatmaps_pad[index - 2 - W]
        dx = 0.5 * (ix1 - ix1_)
        dy = 0.5 * (iy1 - iy1_)
        derivative = np.concatenate([dx, dy], axis=1)
        derivative = derivative.reshape(N, K, 2, 1)
        dxx = ix1 - 2 * i_ + ix1_
        dyy = iy1 - 2 * i_ + iy1_
        dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_)
        hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1)
        hessian = hessian.reshape(N, K, 2, 2)
        # Newton step: coords -= H^-1 * grad (eps keeps H invertible)
        hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2))
        coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze()
        return coords

    def transform_preds_udp(self,
                            coords,
                            center,
                            scale,
                            output_size,
                            use_udp=True):
        """Get final keypoint predictions from heatmaps and apply scaling and
        translation to map them back to the image.
        Note:
            num_keypoints: K
        Args:
            coords (np.ndarray[K, ndims]):
                * If ndims=2, corrds are predicted keypoint location.
                * If ndims=4, corrds are composed of (x, y, scores, tags)
                * If ndims=5, corrds are composed of (x, y, scores, tags,
                  flipped_tags)
            center (np.ndarray[2, ]): Center of the bounding box (x, y).
            scale (np.ndarray[2, ]): Scale of the bounding box
                wrt [width, height].
            output_size (np.ndarray[2, ] | list(2,)): Size of the
                destination heatmaps.
            use_udp (bool): Use unbiased data processing
        Returns:
            np.ndarray: Predicted coordinates in the images.
        """
        assert coords.shape[1] in (2, 4, 5)
        assert len(center) == 2
        assert len(scale) == 2
        assert len(output_size) == 2
        # Recover the scale which is normalized by a factor of 200.
        scale = scale * 200.0
        if use_udp:
            # unbiased mapping uses (size - 1) so heatmap corner pixels map
            # exactly onto box corners
            scale_x = scale[0] / (output_size[0] - 1.0)
            scale_y = scale[1] / (output_size[1] - 1.0)
        else:
            scale_x = scale[0] / output_size[0]
            scale_y = scale[1] / output_size[1]
        target_coords = np.ones_like(coords)
        target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[
            0] * 0.5
        target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[
            1] * 0.5
        return target_coords

    def get_final_preds(self, heatmaps, center, scale, kernelsize=11):
        """the highest heatvalue location with a quarter offset in the
        direction from the highest response to the second highest response.
        Args:
            heatmaps (numpy.ndarray): The predicted heatmaps
            center (numpy.ndarray): The boxes center
            scale (numpy.ndarray): The scale factor
        Returns:
            preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords
            maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints
        """
        coords, maxvals = self.get_max_preds(heatmaps)
        N, K, H, W = heatmaps.shape
        if self.use_dark:
            # UDP/DARK refinement plus unbiased back-projection
            coords = self.post_datk_udp(coords, heatmaps, kernelsize)
            preds = coords.copy()
            # Transform back to the image
            for i in range(N):
                preds[i] = self.transform_preds_udp(preds[i], center[i],
                                                    scale[i], [W, H])
        else:
            # classic refinement: quarter-pixel shift toward the larger
            # neighbouring response
            for n in range(coords.shape[0]):
                for p in range(coords.shape[1]):
                    hm = heatmaps[n][p]
                    px = int(math.floor(coords[n][p][0] + 0.5))
                    py = int(math.floor(coords[n][p][1] + 0.5))
                    if 1 < px < W - 1 and 1 < py < H - 1:
                        diff = np.array([
                            hm[py][px + 1] - hm[py][px - 1],
                            hm[py + 1][px] - hm[py - 1][px]
                        ])
                        coords[n][p] += np.sign(diff) * .25
            preds = coords.copy()
            # Transform back
            for i in range(coords.shape[0]):
                preds[i] = transform_preds(coords[i], center[i], scale[i],
                                           [W, H])
        return preds, maxvals

    def __call__(self, output, center, scale):
        preds, maxvals = self.get_final_preds(output, center, scale)
        # pack as [[kpts_with_scores, mean_score_per_instance]]
        outputs = [[
            np.concatenate(
                (preds, maxvals), axis=-1), np.mean(
                    maxvals, axis=1)
        ]]
        return outputs

View File

@@ -0,0 +1,152 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['MaskRCNN']
@register
class MaskRCNN(BaseArch):
"""
Mask R-CNN network, see https://arxiv.org/abs/1703.06870
Args:
backbone (object): backbone instance
rpn_head (object): `RPNHead` instance
bbox_head (object): `BBoxHead` instance
mask_head (object): `MaskHead` instance
bbox_post_process (object): `BBoxPostProcess` instance
mask_post_process (object): `MaskPostProcess` instance
neck (object): 'FPN' instance
"""
__category__ = 'architecture'
__inject__ = [
'bbox_post_process',
'mask_post_process',
]
def __init__(self,
backbone,
rpn_head,
bbox_head,
mask_head,
bbox_post_process,
mask_post_process,
neck=None):
super(MaskRCNN, self).__init__()
self.backbone = backbone
self.neck = neck
self.rpn_head = rpn_head
self.bbox_head = bbox_head
self.mask_head = mask_head
self.bbox_post_process = bbox_post_process
self.mask_post_process = mask_post_process
@classmethod
def from_config(cls, cfg, *args, **kwargs):
backbone = create(cfg['backbone'])
kwargs = {'input_shape': backbone.out_shape}
neck = cfg['neck'] and create(cfg['neck'], **kwargs)
out_shape = neck and neck.out_shape or backbone.out_shape
kwargs = {'input_shape': out_shape}
rpn_head = create(cfg['rpn_head'], **kwargs)
bbox_head = create(cfg['bbox_head'], **kwargs)
out_shape = neck and out_shape or bbox_head.get_head().out_shape
kwargs = {'input_shape': out_shape}
mask_head = create(cfg['mask_head'], **kwargs)
return {
'backbone': backbone,
'neck': neck,
"rpn_head": rpn_head,
"bbox_head": bbox_head,
"mask_head": mask_head,
}
def _forward(self):
body_feats = self.backbone(self.inputs)
if self.neck is not None:
body_feats = self.neck(body_feats)
if self.training:
rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs)
bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num,
self.inputs)
rois, rois_num = self.bbox_head.get_assigned_rois()
bbox_targets = self.bbox_head.get_assigned_targets()
# Mask Head needs bbox_feat in Mask RCNN
mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs,
bbox_targets, bbox_feat)
return rpn_loss, bbox_loss, mask_loss
else:
rois, rois_num, _ = self.rpn_head(body_feats, self.inputs)
preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None)
im_shape = self.inputs['im_shape']
scale_factor = self.inputs['scale_factor']
bbox, bbox_num, nms_keep_idx = self.bbox_post_process(
preds, (rois, rois_num), im_shape, scale_factor)
mask_out = self.mask_head(
body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func)
# rescale the prediction back to origin image
bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred(
bbox, bbox_num, im_shape, scale_factor)
origin_shape = self.bbox_post_process.get_origin_shape()
mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num,
origin_shape)
if self.use_extra_data:
extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx
"""extra_data:{
'scores': predict scores,
'nms_keep_idx': bbox index before nms,
}
"""
extra_data['scores'] = preds[1] # predict scores (probability)
# Todo: get logits output
extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms
return bbox_pred, bbox_num, mask_pred, extra_data
else:
return bbox_pred, bbox_num, mask_pred
def get_loss(self, ):
bbox_loss, mask_loss, rpn_loss = self._forward()
loss = {}
loss.update(rpn_loss)
loss.update(bbox_loss)
loss.update(mask_loss)
total_loss = paddle.add_n(list(loss.values()))
loss.update({'loss': total_loss})
return loss
def get_pred(self):
    """Run inference and package predictions into the output dict.

    Always contains 'bbox', 'bbox_num' and 'mask'; when
    ``self.use_extra_data`` is set, the pre-NMS extras returned as the
    fourth element of ``_forward()`` are attached under 'extra_data'.
    """
    results = self._forward()
    bbox_pred, bbox_num, mask_pred = results[:3]
    output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}
    if self.use_extra_data:
        output['extra_data'] = results[3]
    return output

View File

@@ -0,0 +1,132 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import typing
from ppdet.core.workspace import register
from ppdet.modeling.post_process import nms
__all__ = ['BaseArch']
@register
class BaseArch(nn.Layer):
    """Common base class for all detection architectures.

    Handles input-layout conversion (NCHW/NHWC), optional fused
    mean/std normalization, and multi-scale test merging; subclasses
    implement ``get_loss()`` / ``get_pred()``.

    Args:
        data_format (str): layout of the input image tensor, 'NCHW'
            (default) or 'NHWC'.
        use_extra_data (bool): if True, subclasses may attach extra
            pre-NMS prediction data to their outputs.
    """

    def __init__(self, data_format='NCHW', use_extra_data=False):
        super(BaseArch, self).__init__()
        self.data_format = data_format
        # current input dict; refreshed on every forward() call
        self.inputs = {}
        # when True, mean/std normalization is applied inside forward()
        # using the scale/bias tensors prepared by load_meanstd()
        self.fuse_norm = False
        self.use_extra_data = use_extra_data

    def load_meanstd(self, cfg_transform):
        """Read mean/std from a 'NormalizeImage' transform entry and cache
        them as fused scale/bias tensors shaped for self.data_format."""
        scale = 1.
        # ImageNet defaults, used when no NormalizeImage entry is present
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        for item in cfg_transform:
            if 'NormalizeImage' in item:
                mean = np.array(
                    item['NormalizeImage']['mean'], dtype=np.float32)
                std = np.array(item['NormalizeImage']['std'], dtype=np.float32)
                if item['NormalizeImage'].get('is_scale', True):
                    scale = 1. / 255.
                break
        if self.data_format == 'NHWC':
            self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3))
            self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3))
        else:
            self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1))
            self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1))

    def forward(self, inputs):
        """Dispatch to get_loss() in training or get_pred() in eval mode.

        In eval mode ``inputs`` may be a sequence of input dicts
        (multi-scale test); per-scale predictions are merged via
        merge_multi_scale_predictions().
        """
        if self.data_format == 'NHWC':
            image = inputs['image']
            inputs['image'] = paddle.transpose(image, [0, 2, 3, 1])

        if self.fuse_norm:
            image = inputs['image']
            self.inputs['image'] = image * self.scale + self.bias
            self.inputs['im_shape'] = inputs['im_shape']
            self.inputs['scale_factor'] = inputs['scale_factor']
        else:
            self.inputs = inputs

        # optional pre-computation hook (no-op by default)
        self.model_arch()

        if self.training:
            out = self.get_loss()
        else:
            inputs_list = []
            # multi-scale input
            if not isinstance(inputs, typing.Sequence):
                inputs_list.append(inputs)
            else:
                inputs_list.extend(inputs)
            outs = []
            for inp in inputs_list:
                if self.fuse_norm:
                    self.inputs['image'] = inp['image'] * self.scale + self.bias
                    self.inputs['im_shape'] = inp['im_shape']
                    self.inputs['scale_factor'] = inp['scale_factor']
                else:
                    self.inputs = inp
                outs.append(self.get_pred())
            # multi-scale test
            if len(outs) > 1:
                out = self.merge_multi_scale_predictions(outs)
            else:
                out = outs[0]
        return out

    def merge_multi_scale_predictions(self, outs):
        """Merge per-scale bbox predictions with class-wise NMS, keeping the
        overall top-k boxes by score. RCNN-family architectures only."""
        # default values for architectures not included in following list
        num_classes = 80
        nms_threshold = 0.5
        keep_top_k = 100

        if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'):
            num_classes = self.bbox_head.num_classes
            keep_top_k = self.bbox_post_process.nms.keep_top_k
            nms_threshold = self.bbox_post_process.nms.nms_threshold
        else:
            raise Exception(
                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
            )

        final_boxes = []
        all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy()
        for c in range(num_classes):
            # bbox rows are [class_id, score, x1, y1, x2, y2]
            idxs = all_scale_outs[:, 0] == c
            if np.count_nonzero(idxs) == 0:
                continue
            r = nms(all_scale_outs[idxs, 1:], nms_threshold)
            final_boxes.append(
                np.concatenate([np.full((r.shape[0], 1), c), r], 1))
        out = np.concatenate(final_boxes)
        # sort by score and keep the global top-k across all classes
        out = np.concatenate(sorted(
            out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6))
        out = {
            'bbox': paddle.to_tensor(out),
            'bbox_num': paddle.to_tensor(np.array([out.shape[0], ]))
        }
        return out

    def build_inputs(self, data, input_def):
        """Zip positional ``data`` with the ``input_def`` names into a dict."""
        inputs = {}
        for i, k in enumerate(input_def):
            inputs[k] = data[i]
        return inputs

    def model_arch(self, ):
        # hook executed before loss/prediction; subclasses may override
        pass

    def get_loss(self, ):
        raise NotImplementedError("Should implement get_loss method!")

    def get_pred(self, ):
        raise NotImplementedError("Should implement get_pred method!")

View File

@@ -0,0 +1,69 @@
from typing import Dict
from collections import OrderedDict
from ppdet.modeling.architectures.meta_arch import BaseArch
class MultiSteamDetector(BaseArch):
    """Base class for multi-submodel detectors (e.g. teacher/student pairs
    used in semi-supervised detection).

    NOTE(review): the class name looks like a typo of
    "MultiStreamDetector", but it is referenced elsewhere and must not be
    renamed here.

    Args:
        model (Dict[str, BaseArch]): named submodels; each is attached as
            an attribute of this detector.
        train_cfg: training configuration (project-specific).
        test_cfg: test configuration; its 'inference_on' key selects which
            submodel runs at inference (defaults to the first one).
    """

    def __init__(self,
                 model: Dict[str, BaseArch],
                 train_cfg=None,
                 test_cfg=None):
        super(MultiSteamDetector, self).__init__()
        self.submodules = list(model.keys())
        for k, v in model.items():
            setattr(self, k, v)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        self.inference_on = self.test_cfg.get("inference_on",
                                              self.submodules[0])
        self.first_load = True

    def forward(self, inputs, return_loss=True, **kwargs):
        """Calls either :func:`forward_train` or :func:`forward_test` depending
        on whether ``return_loss`` is ``True``.

        Note this setting will change the expected inputs. When
        ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor
        and List[dict]), and when ``return_loss=False``, img and img_meta
        should be double nested (i.e. List[Tensor], List[List[dict]]), with
        the outer list indicating test time augmentations.
        """
        if return_loss:
            return self.forward_train(inputs, **kwargs)
        else:
            return self.forward_test(inputs, **kwargs)

    def get_loss(self, **kwargs):
        # losses = self(**data)
        # NOTE(review): ``self`` is passed as a positional argument even
        # though forward_train already receives it via the bound method —
        # this looks like a bug; confirm against callers before changing.
        return self.forward_train(self, **kwargs)

    def model(self, **kwargs) -> BaseArch:
        """Return the submodel named by kwargs['submodule'], or the one
        selected by self.inference_on by default."""
        if "submodule" in kwargs:
            assert (kwargs["submodule"] in self.submodules
                    ), "Detector does not contain submodule {}".format(kwargs[
                        "submodule"])
            model: BaseArch = getattr(self, kwargs["submodule"])
        else:
            model: BaseArch = getattr(self, self.inference_on)
        return model

    def freeze(self, model_ref: str):
        """Put submodel ``model_ref`` into eval mode and stop its gradients."""
        assert model_ref in self.submodules
        model = getattr(self, model_ref)
        model.eval()
        for param in model.parameters():
            param.stop_gradient = True

    def update_ema_model(self, momentum=0.9996):
        """EMA-update teacher weights from the student:
        teacher = momentum * teacher + (1 - momentum) * student."""
        # print(momentum)
        model_dict = self.student.state_dict()
        new_dict = OrderedDict()
        for key, value in self.teacher.state_dict().items():
            if key in model_dict.keys():
                new_dict[key] = (model_dict[key] *
                                 (1 - momentum) + value * momentum)
            else:
                raise Exception("{} is not found in student model".format(key))
        self.teacher.set_dict(new_dict)

View File

@@ -0,0 +1,99 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['PicoDet']
@register
class PicoDet(BaseArch):
    """
    PicoDet detector built on Generalized Focal Loss,
    see https://arxiv.org/abs/2006.04388

    Args:
        backbone (object): backbone instance
        neck (object): 'FPN' instance
        head (object): 'PicoHead' instance
        nms_cpu (bool): whether NMS runs on CPU during post-processing
    """

    __category__ = 'architecture'

    def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False):
        super(PicoDet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        # export-time switches; both enabled by default
        self.export_post_process = True
        self.export_nms = True
        self.nms_cpu = nms_cpu

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # build backbone -> neck -> head, threading output shapes through
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        fpn_feats = self.neck(self.backbone(self.inputs))
        head_outs = self.head(fpn_feats, self.export_post_process)
        if not self.training and self.export_post_process:
            # decode + NMS into (bboxes, bbox_num)
            return self.head.post_process(
                head_outs,
                self.inputs['scale_factor'],
                export_nms=self.export_nms,
                nms_cpu=self.nms_cpu)
        return head_outs, None

    def get_loss(self, ):
        head_outs, _ = self._forward()
        loss = dict(self.head.get_loss(head_outs, self.inputs))
        # total loss is the sum of every individual loss term
        loss['loss'] = paddle.add_n(list(loss.values()))
        return loss

    def get_pred(self):
        if not self.export_post_process:
            # raw head outputs for export without post-processing
            return {'picodet': self._forward()[0]}
        first, second = self._forward()
        if self.export_nms:
            return {'bbox': first, 'bbox_num': second}
        return {'bbox': first, 'scores': second}

View File

@@ -0,0 +1,114 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from .. import layers as L
__all__ = ['METRO_Body']
def orthographic_projection(X, camera):
    """Perform orthographic projection of 3D points X using the camera parameters

    Args:
        X: size = [B, N, 3] batch of 3D points
        camera: size = [B, 3] -- presumably [scale, trans_x, trans_y] per
            sample (scale at index 0, translation at indices 1:3); verify
            against the caller.
    Returns:
        Projected 2D points -- size = [B, N, 2]
    """
    camera = camera.reshape((-1, 1, 3))
    # translate in the image plane (orthographic: the z coordinate is dropped)
    X_trans = X[:, :, :2] + camera[:, :, 1:]
    shape = paddle.shape(X_trans)
    # flatten so the per-batch scale broadcasts, then restore the shape
    X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape)
    return X_2d
@register
class METRO_Body(BaseArch):
    """3D human pose regression network (modified from METRO,
    see https://arxiv.org/abs/2012.09760)."""

    __category__ = 'architecture'
    __inject__ = ['loss']

    def __init__(
            self,
            num_joints,
            backbone='HRNet',
            trans_encoder='',
            loss='Pose3DLoss', ):
        """
        Modified from METRO network, see https://arxiv.org/abs/2012.09760

        Args:
            num_joints (int): number of 3D joints regressed by the network
            backbone (nn.Layer): backbone instance
            trans_encoder: transformer encoder over the learned image tokens
            loss (object): 'Pose3DLoss' instance
        """
        super(METRO_Body, self).__init__()
        self.num_joints = num_joints
        self.backbone = backbone
        self.loss = loss
        self.deploy = False

        self.trans_encoder = trans_encoder
        # maps the 49 spatial tokens to (num_joints + 10) learned tokens
        self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1)
        self.cam_param_fc = paddle.nn.Linear(3, 2)

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        trans_encoder = create(cfg['trans_encoder'])

        return {'backbone': backbone, 'trans_encoder': trans_encoder}

    def _forward(self):
        batch_size = self.inputs['image'].shape[0]

        image_feat = self.backbone(self.inputs)
        # NOTE(review): assumes the backbone emits a (B, 2048, 7, 7) feature
        # map (49 spatial positions) -- confirm for non-default backbones.
        image_feat_flatten = image_feat.reshape((batch_size, 2048, 49))
        image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1))
        # and apply a conv layer to learn image token for each 3d joint/vertex position
        features = self.conv_learn_tokens(image_feat_flatten)  # (B, J, C)

        if self.training:
            # apply mask vertex/joint modeling
            # meta_masks is a tensor of all the masks, randomly generated in dataloader
            # we pre-define a [MASK] token, which is a floating-value vector with 0.01s
            meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048))
            constant_tensor = paddle.ones_like(features) * 0.01
            features = features * meta_masks + constant_tensor * (1 - meta_masks
                                                                  )
        pred_out = self.trans_encoder(features)

        # first num_joints tokens carry 3D joints; the rest feed the camera fc
        pred_3d_joints = pred_out[:, :self.num_joints, :]
        cam_features = pred_out[:, self.num_joints:, :]

        # learn camera parameters
        pred_2d_joints = self.cam_param_fc(cam_features)
        return pred_3d_joints, pred_2d_joints

    def get_loss(self):
        preds_3d, preds_2d = self._forward()
        loss = self.loss(preds_3d, preds_2d, self.inputs)
        output = {'loss': loss}
        return output

    def get_pred(self):
        preds_3d, preds_2d = self._forward()
        outputs = {'pose3d': preds_3d, 'pose2d': preds_2d}
        return outputs

View File

@@ -0,0 +1,260 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead']
# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head
# PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head
@register
class PPYOLOE(BaseArch):
    """
    PPYOLOE network, see https://arxiv.org/abs/2203.16250

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): neck instance
        yolo_head (nn.Layer): anchor_head instance
        post_process (object): `BBoxPostProcess` instance
        ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod)
        for_distill (bool): whether for distillation
        feat_distill_place (str): distill which feature for distillation
        for_mot (bool): whether return other features for multi-object tracking
            models, default False in pure object detection models.
    """

    __category__ = 'architecture'
    __shared__ = ['for_distill']
    __inject__ = ['post_process', 'ssod_loss']

    def __init__(self,
                 backbone='CSPResNet',
                 neck='CustomCSPPAN',
                 yolo_head='PPYOLOEHead',
                 post_process='BBoxPostProcess',
                 ssod_loss='SSODPPYOLOELoss',
                 for_distill=False,
                 feat_distill_place='neck_feats',
                 for_mot=False):
        super(PPYOLOE, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.yolo_head = yolo_head
        self.post_process = post_process
        self.for_mot = for_mot

        # for ssod, semi-det
        self.is_teacher = False
        self.ssod_loss = ssod_loss

        # distill
        self.for_distill = for_distill
        self.feat_distill_place = feat_distill_place
        if for_distill:
            assert feat_distill_place in ['backbone_feats', 'neck_feats']

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # build backbone -> neck -> head, threading output shapes through
        backbone = create(cfg['backbone'])

        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)

        kwargs = {'input_shape': neck.out_shape}
        yolo_head = create(cfg['yolo_head'], **kwargs)

        return {
            'backbone': backbone,
            'neck': neck,
            "yolo_head": yolo_head,
        }

    def _forward(self):
        """Return head losses in training/teacher mode, or the detection
        output dict in eval mode."""
        body_feats = self.backbone(self.inputs)
        neck_feats = self.neck(body_feats, self.for_mot)

        self.is_teacher = self.inputs.get('is_teacher', False)  # for semi-det
        if self.training or self.is_teacher:
            yolo_losses = self.yolo_head(neck_feats, self.inputs)

            if self.for_distill:
                # stash the requested feature for the distillation loss
                if self.feat_distill_place == 'backbone_feats':
                    self.yolo_head.distill_pairs['backbone_feats'] = body_feats
                elif self.feat_distill_place == 'neck_feats':
                    self.yolo_head.distill_pairs['neck_feats'] = neck_feats
                else:
                    raise ValueError
            return yolo_losses
        else:
            yolo_head_outs = self.yolo_head(neck_feats)
            if self.post_process is not None:
                bbox, bbox_num, nms_keep_idx = self.post_process(
                    yolo_head_outs, self.yolo_head.mask_anchors,
                    self.inputs['im_shape'], self.inputs['scale_factor'])
            else:
                # fall back to the head's built-in post-processing
                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
                    yolo_head_outs, self.inputs['scale_factor'])

            if self.use_extra_data:
                extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx
                """extra_data:{
                            'scores': predict scores,
                            'nms_keep_idx': bbox index before nms,
                            }
                """
                extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
                extra_data['nms_keep_idx'] = nms_keep_idx
                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
            else:
                output = {'bbox': bbox, 'bbox_num': bbox_num}

            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

    def get_loss_keys(self):
        # names of the loss components produced by the head (used by ssod)
        return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast']

    def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg):
        """Semi-supervised (ssod) loss between student and teacher outputs."""
        ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs,
                                     train_cfg)
        return ssod_losses
@register
class PPYOLOEWithAuxHead(BaseArch):
    """PPYOLOE variant trained with an auxiliary head fed by a duplicated
    (deep-copied) neck; at inference only the main head is used."""

    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone='CSPResNet',
                 neck='CustomCSPPAN',
                 yolo_head='PPYOLOEHead',
                 aux_head='SimpleConvHead',
                 post_process='BBoxPostProcess',
                 for_mot=False,
                 detach_epoch=5):
        """
        PPYOLOE network, see https://arxiv.org/abs/2203.16250

        Args:
            backbone (nn.Layer): backbone instance
            neck (nn.Layer): neck instance
            yolo_head (nn.Layer): anchor_head instance
            aux_head (nn.Layer): auxiliary head, used only during training
            post_process (object): `BBoxPostProcess` instance
            for_mot (bool): whether return other features for multi-object tracking
                models, default False in pure object detection models.
            detach_epoch (int): from this epoch on, the aux neck runs on
                detached features (no gradient back into the backbone)
        """
        super(PPYOLOEWithAuxHead, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.aux_neck = copy.deepcopy(self.neck)

        self.yolo_head = yolo_head
        self.aux_head = aux_head
        self.post_process = post_process
        self.for_mot = for_mot
        self.detach_epoch = detach_epoch

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])

        # fpn
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)
        aux_neck = copy.deepcopy(neck)

        # head
        kwargs = {'input_shape': neck.out_shape}
        yolo_head = create(cfg['yolo_head'], **kwargs)
        aux_head = create(cfg['aux_head'], **kwargs)

        return {
            'backbone': backbone,
            'neck': neck,
            "yolo_head": yolo_head,
            'aux_head': aux_head,
        }

    def _forward(self):
        body_feats = self.backbone(self.inputs)
        neck_feats = self.neck(body_feats, self.for_mot)

        if self.training:
            if self.inputs['epoch_id'] >= self.detach_epoch:
                # after detach_epoch, cut gradients from aux path to backbone
                aux_neck_feats = self.aux_neck([f.detach() for f in body_feats])
                # NOTE(review): generator expression -- assumes aux_head
                # iterates it exactly once; confirm if aux_head changes.
                dual_neck_feats = (paddle.concat(
                    [f.detach(), aux_f], axis=1) for f, aux_f in
                                   zip(neck_feats, aux_neck_feats))
            else:
                aux_neck_feats = self.aux_neck(body_feats)
                dual_neck_feats = (paddle.concat(
                    [f, aux_f], axis=1) for f, aux_f in
                                   zip(neck_feats, aux_neck_feats))
            aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats)
            loss = self.yolo_head(
                neck_feats,
                self.inputs,
                aux_pred=[aux_cls_scores, aux_bbox_preds])
            return loss
        else:
            yolo_head_outs = self.yolo_head(neck_feats)

            if self.post_process is not None:
                bbox, bbox_num, nms_keep_idx = self.post_process(
                    yolo_head_outs, self.yolo_head.mask_anchors,
                    self.inputs['im_shape'], self.inputs['scale_factor'])
            else:
                bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
                    yolo_head_outs, self.inputs['scale_factor'])

            if self.use_extra_data:
                extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx
                """extra_data:{
                            'scores': predict scores,
                            'nms_keep_idx': bbox index before nms,
                            }
                """
                extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
                # Todo: get logits output
                extra_data['nms_keep_idx'] = nms_keep_idx
                output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
            else:
                output = {'bbox': bbox, 'bbox_num': bbox_num}
            return output

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,104 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['QueryInst']
@register
class QueryInst(BaseArch):
    """Query-based instance segmentation architecture (QueryInst)."""

    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone,
                 neck,
                 rpn_head,
                 roi_head,
                 post_process='SparsePostProcess'):
        """
        Args:
            backbone (object): backbone instance
            neck (object): feature pyramid instance
            rpn_head (object): produces the learned proposal boxes/features
            roi_head (object): RoI head producing class/bbox/mask outputs
            post_process (object): `SparsePostProcess` instance
        """
        super(QueryInst, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.rpn_head = rpn_head
        self.roi_head = roi_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)

        kwargs = {'input_shape': neck.out_shape}
        rpn_head = create(cfg['rpn_head'], **kwargs)
        roi_head = create(cfg['roi_head'], **kwargs)

        return {
            'backbone': backbone,
            'neck': neck,
            'rpn_head': rpn_head,
            "roi_head": roi_head
        }

    def _forward(self, targets=None):
        """Run the network: roi_head outputs dict in training, or
        post-processed (bbox_pred, bbox_num, mask_pred) in eval."""
        features = self.backbone(self.inputs)
        features = self.neck(features)

        proposal_bboxes, proposal_features = self.rpn_head(self.inputs[
            'img_whwh'])
        outputs = self.roi_head(features, proposal_bboxes, proposal_features,
                                targets)

        if self.training:
            return outputs
        else:
            bbox_pred, bbox_num, mask_pred = self.post_process(
                outputs['class_logits'], outputs['bbox_pred'],
                self.inputs['scale_factor_whwh'], self.inputs['ori_shape'],
                outputs['mask_logits'])
            return bbox_pred, bbox_num, mask_pred

    def get_loss(self):
        """Build per-image target dicts from the ground truth and compute
        the training losses (their sum is added under the 'loss' key)."""
        targets = []

        for i in range(len(self.inputs['img_whwh'])):
            boxes = self.inputs['gt_bbox'][i]
            labels = self.inputs['gt_class'][i].squeeze(-1)
            img_whwh = self.inputs['img_whwh'][i]
            if boxes.shape[0] != 0:
                # repeat the image [w,h,w,h] row once per gt box
                img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1])
            else:
                # no gt boxes: keep an empty, correctly-shaped placeholder
                img_whwh_tgt = paddle.zeros_like(boxes)
            gt_segm = self.inputs['gt_segm'][i].astype('float32')

            targets.append({
                'boxes': boxes,
                'labels': labels,
                'img_whwh': img_whwh,
                'img_whwh_tgt': img_whwh_tgt,
                'gt_segm': gt_segm
            })

        losses = self._forward(targets)
        losses.update({'loss': sum(losses.values())})
        return losses

    def get_pred(self):
        bbox_pred, bbox_num, mask_pred = self._forward()
        return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred}

View File

@@ -0,0 +1,84 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['RetinaNet']
@register
class RetinaNet(BaseArch):
    """Single-stage RetinaNet detector: backbone -> neck -> dense head.

    Args:
        backbone (object): backbone instance
        neck (object): feature pyramid instance
        head (object): dense prediction head instance
    """

    __category__ = 'architecture'

    def __init__(self, backbone, neck, head):
        super(RetinaNet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, 'head': head}

    def _forward(self):
        feats = self.neck(self.backbone(self.inputs))
        if self.training:
            return self.head(feats, self.inputs)

        head_outs = self.head(feats)
        bbox, bbox_num, nms_keep_idx = self.head.post_process(
            head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
        result = {'bbox': bbox, 'bbox_num': bbox_num}
        if self.use_extra_data:
            # record pre-NMS outputs: raw logits, sigmoid scores and the
            # indices of the boxes kept by NMS
            preds_logits = self.head.decode_cls_logits(head_outs[0])
            result["extra_data"] = {
                'logits': preds_logits,
                'scores': F.sigmoid(preds_logits),
                'nms_keep_idx': nms_keep_idx,
            }
        return result

    def get_loss(self):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,83 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['S2ANet']
@register
class S2ANet(BaseArch):
    """
    S2ANet oriented-object detector, see https://arxiv.org/pdf/2008.09397.pdf

    Args:
        backbone (object): backbone instance
        neck (object): `FPN` instance (may be None/absent)
        head (object): `Head` instance
    """

    __category__ = 'architecture'
    __inject__ = ['head']

    def __init__(self, backbone, neck, head):
        super(S2ANet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.s2anet_head = head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck_cfg = cfg['neck']
        # neck is optional: keep the falsy config value when absent
        neck = create(neck_cfg, input_shape=backbone.out_shape) \
            if neck_cfg else neck_cfg
        head_input = neck.out_shape if (neck and neck.out_shape) \
            else backbone.out_shape
        head = create(cfg['head'], input_shape=head_input)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        feats = self.backbone(self.inputs)
        if self.neck is not None:
            feats = self.neck(feats)
        if self.training:
            return self.s2anet_head(feats, self.inputs)

        head_outs = self.s2anet_head(feats)
        # post_process: decode rotated boxes ...
        bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs)
        # ... then rescale them back to the original image
        bboxes = self.s2anet_head.get_pred(bboxes, bbox_num,
                                           self.inputs['im_shape'],
                                           self.inputs['scale_factor'])
        return {'bbox': bboxes, 'bbox_num': bbox_num}

    def get_loss(self, ):
        return self._forward()

    def get_pred(self):
        return self._forward()

View File

@@ -0,0 +1,110 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['SOLOv2']
@register
class SOLOv2(BaseArch):
    """
    SOLOv2 instance segmentation network, see https://arxiv.org/abs/2003.10152

    Args:
        backbone (object): an backbone instance
        solov2_head (object): an `SOLOv2Head` instance
        mask_head (object): an `SOLOv2MaskHead` instance
        neck (object): neck of network, such as feature pyramid network instance
    """

    __category__ = 'architecture'

    def __init__(self, backbone, solov2_head, mask_head, neck=None):
        super(SOLOv2, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.solov2_head = solov2_head
        self.mask_head = mask_head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head_kwargs = {'input_shape': neck.out_shape}
        solov2_head = create(cfg['solov2_head'], **head_kwargs)
        mask_head = create(cfg['mask_head'], **head_kwargs)
        return {
            'backbone': backbone,
            'neck': neck,
            'solov2_head': solov2_head,
            'mask_head': mask_head,
        }

    def model_arch(self):
        # run backbone+neck once, caching head outputs for get_loss/get_pred
        feats = self.neck(self.backbone(self.inputs))
        self.seg_pred = self.mask_head(feats)
        self.cate_pred_list, self.kernel_pred_list = self.solov2_head(feats)

    def get_loss(self, ):
        # gather whichever per-level ground-truth tensors are present
        gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], []
        for level in range(len(self.solov2_head.seg_num_grids)):
            for prefix, bucket in (('ins_label', gt_ins_labels),
                                   ('cate_label', gt_cate_labels),
                                   ('grid_order', gt_grid_orders)):
                key = '{}{}'.format(prefix, level)
                if key in self.inputs:
                    bucket.append(self.inputs[key])
        loss = dict(
            self.solov2_head.get_loss(self.cate_pred_list,
                                      self.kernel_pred_list, self.seg_pred,
                                      gt_ins_labels, gt_cate_labels,
                                      gt_grid_orders, self.inputs['fg_num']))
        # total loss is the sum of every individual loss term
        loss['loss'] = paddle.add_n(list(loss.values()))
        return loss

    def get_pred(self):
        seg_masks, cate_labels, cate_scores, bbox_num = \
            self.solov2_head.get_prediction(
                self.cate_pred_list, self.kernel_pred_list, self.seg_pred,
                self.inputs['im_shape'], self.inputs['scale_factor'])
        return {
            "segm": seg_masks,
            "bbox_num": bbox_num,
            'cate_label': cate_labels,
            'cate_score': cate_scores
        }

View File

@@ -0,0 +1,99 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ["SparseRCNN"]
@register
class SparseRCNN(BaseArch):
    """Sparse R-CNN detector: learned proposals refined by an iterative
    RoI head, with a sparse post-process at inference."""

    __category__ = 'architecture'
    __inject__ = ["postprocess"]

    def __init__(self,
                 backbone,
                 neck,
                 head="SparsercnnHead",
                 postprocess="SparsePostProcess"):
        super(SparseRCNN, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.postprocess = postprocess

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], roi_input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        fpn_feats = self.neck(self.backbone(self.inputs))
        head_outs = self.head(fpn_feats, self.inputs["img_whwh"])
        if self.training:
            return head_outs
        # eval: decode logits/boxes into (bbox_pred, bbox_num)
        return self.postprocess(
            head_outs["pred_logits"], head_outs["pred_boxes"],
            self.inputs["scale_factor_whwh"], self.inputs["ori_shape"])

    def get_loss(self):
        """Assemble per-image target dicts and compute the matching loss."""
        targets = []
        for labels, boxes, whwh in zip(self.inputs["gt_class"],
                                       self.inputs["gt_bbox"],
                                       self.inputs["img_whwh"]):
            targets.append({
                "boxes": boxes,
                "labels": labels.squeeze(-1),
                "img_whwh": whwh,
                # one [w,h,w,h] row per gt box
                "img_whwh_tgt": whwh.unsqueeze(0).tile(
                    [int(boxes.shape[0]), 1]),
            })

        loss_dict = self.head.get_loss(self._forward(), targets)
        # 'acc' is a metric, not a loss: exclude it from the total
        acc = loss_dict.pop("acc")
        loss_dict["loss"] = sum(loss_dict.values())
        loss_dict["acc"] = acc
        return loss_dict

    def get_pred(self):
        bbox_pred, bbox_num = self._forward()
        return {'bbox': bbox_pred, 'bbox_num': bbox_num}

View File

@@ -0,0 +1,118 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import paddle
import paddle.nn.functional as F
__all__ = ['SSD']
@register
class SSD(BaseArch):
    """
    Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325

    Args:
        backbone (nn.Layer): backbone instance
        ssd_head (nn.Layer): `SSDHead` instance
        post_process (object): `BBoxPostProcess` instance
        r34_backbone (bool): whether to adapt a ResNet-34 backbone for SSD
            by flattening the stage-3 downsampling stride to 1
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self, backbone, ssd_head, post_process, r34_backbone=False):
        super(SSD, self).__init__()
        self.backbone = backbone
        self.ssd_head = ssd_head
        self.post_process = post_process
        self.r34_backbone = r34_backbone
        if self.r34_backbone:
            from ppdet.modeling.backbones.resnet import ResNet
            assert isinstance(self.backbone, ResNet) and \
                   self.backbone.depth == 34, \
                "If you set r34_backbone=True, please use ResNet-34 as backbone."
            # Keep stage-3 feature resolution: remove the downsampling stride
            # from both the first block's conv and its shortcut.
            self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1]
            self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1]

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # Build the backbone first so its output shape can size the head.
        backbone = create(cfg['backbone'])
        ssd_head = create(cfg['ssd_head'], input_shape=backbone.out_shape)
        return {'backbone': backbone, "ssd_head": ssd_head}

    def _forward(self):
        feats = self.backbone(self.inputs)
        if self.training:
            # With ground truth supplied, the head computes the loss directly.
            return self.ssd_head(feats, self.inputs['image'],
                                 self.inputs['gt_bbox'],
                                 self.inputs['gt_class'])
        preds, anchors = self.ssd_head(feats, self.inputs['image'])
        bbox, bbox_num, nms_keep_idx = self.post_process(
            preds, anchors, self.inputs['im_shape'],
            self.inputs['scale_factor'])
        if not self.use_extra_data:
            return bbox, bbox_num
        # Record pre-NMS outputs (scores/logits and kept indices) for
        # downstream consumers such as model-interpretation tools.
        cls_logits = preds[1]  # list of [1 x NumBBox x NumClass] tensors
        concat_logits = paddle.concat(cls_logits, axis=1)
        extra_data = {
            'scores': F.softmax(concat_logits).transpose([0, 2, 1]),
            'logits': concat_logits.transpose([0, 2, 1]),
            'nms_keep_idx': nms_keep_idx,  # bbox indices surviving NMS
        }
        return bbox, bbox_num, extra_data

    def get_loss(self, ):
        """Training entry point; _forward returns the head loss directly."""
        return {"loss": self._forward()}

    def get_pred(self):
        """Return detections, plus pre-NMS extra data when enabled."""
        outs = self._forward()
        if self.use_extra_data:
            boxes, counts, extra = outs
            return {"bbox": boxes, "bbox_num": counts, "extra_data": extra}
        boxes, counts = outs
        return {"bbox": boxes, "bbox_num": counts}

View File

@@ -0,0 +1,77 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['TOOD']
@register
class TOOD(BaseArch):
    """
    TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): 'FPN' instance
        head (nn.Layer): 'TOODHead' instance
    """
    __category__ = 'architecture'

    def __init__(self, backbone, neck, head):
        super(TOOD, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # Instantiate components in order, feeding each stage the output
        # shape of the previous one.
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        # backbone -> FPN -> task-aligned head
        feats = self.neck(self.backbone(self.inputs))
        head_outs = self.head(feats)
        if self.training:
            # Head computes training losses against the batch targets.
            return self.head.get_loss(head_outs, self.inputs)
        # Decode predictions back to the original image scale.
        return self.head.post_process(head_outs, self.inputs['im_shape'],
                                      self.inputs['scale_factor'])

    def get_loss(self):
        """Training entry point; returns the head's loss dict."""
        return self._forward()

    def get_pred(self):
        """Return final detections as {'bbox': ..., 'bbox_num': ...}."""
        boxes, counts = self._forward()
        return {'bbox': boxes, 'bbox_num': counts}

View File

@@ -0,0 +1,98 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['TTFNet']
@register
class TTFNet(BaseArch):
    """
    TTFNet network, see https://arxiv.org/abs/1909.00700

    Args:
        backbone (object): backbone instance
        neck (object): 'TTFFPN' instance
        ttf_head (object): 'TTFHead' instance
        post_process (object): 'BBoxPostProcess' instance
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']

    def __init__(self,
                 backbone='DarkNet',
                 neck='TTFFPN',
                 ttf_head='TTFHead',
                 post_process='BBoxPostProcess'):
        super(TTFNet, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.ttf_head = ttf_head
        self.post_process = post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # Build backbone -> neck -> head, chaining output shapes.
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        ttf_head = create(cfg['ttf_head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "ttf_head": ttf_head}

    def _forward(self):
        # Head emits a center heatmap (hm) and a box-size map (wh).
        feats = self.neck(self.backbone(self.inputs))
        hm, wh = self.ttf_head(feats)
        if self.training:
            return hm, wh
        # Decode the raw maps into boxes at the original image scale.
        return self.post_process(hm, wh, self.inputs['im_shape'],
                                 self.inputs['scale_factor'])

    def get_loss(self, ):
        """Compute head losses against the TTF training targets."""
        hm, wh = self._forward()
        losses = dict(
            self.ttf_head.get_loss(hm, wh, self.inputs['ttf_heatmap'],
                                   self.inputs['ttf_box_target'],
                                   self.inputs['ttf_reg_weight']))
        # Aggregate the individual terms into the total training loss.
        losses['loss'] = paddle.add_n(list(losses.values()))
        return losses

    def get_pred(self):
        """Return final detections as {'bbox': ..., 'bbox_num': ...}."""
        boxes, counts = self._forward()
        return {"bbox": boxes, "bbox_num": counts}

View File

@@ -0,0 +1,150 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
from ..post_process import JDEBBoxPostProcess
__all__ = ['YOLOv3']
# YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3
# PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head
@register
class YOLOv3(BaseArch):
    """Generic one-stage YOLO architecture, see https://arxiv.org/abs/1804.02767

    Shared by YOLOv3, PP-YOLO, PP-YOLOv2, PP-YOLOE and PP-YOLOE+; for
    PP-YOLOE(+) the PPYOLOE class in ppyoloe.py is recommended instead,
    especially with distillation or an auxiliary head.
    """
    __category__ = 'architecture'
    __shared__ = ['data_format']
    __inject__ = ['post_process']
    def __init__(self,
                 backbone='DarkNet',
                 neck='YOLOv3FPN',
                 yolo_head='YOLOv3Head',
                 post_process='BBoxPostProcess',
                 data_format='NCHW',
                 for_mot=False):
        """
        YOLOv3 network, see https://arxiv.org/abs/1804.02767

        Args:
            backbone (nn.Layer): backbone instance
            neck (nn.Layer): neck instance
            yolo_head (nn.Layer): anchor_head instance
            post_process (object): `BBoxPostProcess` instance
            data_format (str): data format, NCHW or NHWC
            for_mot (bool): whether return other features for multi-object tracking
                models, default False in pure object detection models.
        """
        super(YOLOv3, self).__init__(data_format=data_format)
        self.backbone = backbone
        self.neck = neck
        self.yolo_head = yolo_head
        self.post_process = post_process
        self.for_mot = for_mot
        # A JDE-style post process additionally returns kept box indices.
        self.return_idx = isinstance(post_process, JDEBBoxPostProcess)
    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # fpn
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)
        # head
        kwargs = {'input_shape': neck.out_shape}
        yolo_head = create(cfg['yolo_head'], **kwargs)
        return {
            'backbone': backbone,
            'neck': neck,
            "yolo_head": yolo_head,
        }
    def _forward(self):
        body_feats = self.backbone(self.inputs)
        if self.for_mot:
            # MOT necks may return both detection and embedding features.
            neck_feats = self.neck(body_feats, self.for_mot)
        else:
            neck_feats = self.neck(body_feats)
        if isinstance(neck_feats, dict):
            # Dict output only happens in MOT mode: split the two streams.
            assert self.for_mot == True
            emb_feats = neck_feats['emb_feats']
            neck_feats = neck_feats['yolo_feats']
        if self.training:
            yolo_losses = self.yolo_head(neck_feats, self.inputs)
            if self.for_mot:
                # NOTE(review): assumes the MOT neck returned a dict above;
                # otherwise `emb_feats` is unbound here — confirm for all necks.
                return {'det_losses': yolo_losses, 'emb_feats': emb_feats}
            else:
                return yolo_losses
        else:
            yolo_head_outs = self.yolo_head(neck_feats)
            if self.for_mot:
                # the detection part of JDE MOT model
                boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process(
                    yolo_head_outs, self.yolo_head.mask_anchors)
                output = {
                    'bbox': bbox,
                    'bbox_num': bbox_num,
                    'boxes_idx': boxes_idx,
                    'nms_keep_idx': nms_keep_idx,
                    'emb_feats': emb_feats,
                }
            else:
                if self.return_idx:
                    # the detection part of JDE MOT model
                    _, bbox, bbox_num, nms_keep_idx = self.post_process(
                        yolo_head_outs, self.yolo_head.mask_anchors)
                elif self.post_process is not None:
                    # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors
                    bbox, bbox_num, nms_keep_idx = self.post_process(
                        yolo_head_outs, self.yolo_head.mask_anchors,
                        self.inputs['im_shape'], self.inputs['scale_factor'])
                else:
                    # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
                    bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process(
                        yolo_head_outs, self.inputs['scale_factor'])
                if self.use_extra_data:
                    extra_data = {}  # record the bbox output before nms, such like scores and nms_keep_idx
                    """extra_data:{
                        'scores': predict scores,
                        'nms_keep_idx': bbox index before nms,
                        }
                    """
                    extra_data['scores'] = yolo_head_outs[0]  # predict scores (probability)
                    # Todo: get logits output
                    extra_data['nms_keep_idx'] = nms_keep_idx
                    # Todo support for mask_anchors yolo
                    output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data}
                else:
                    output = {'bbox': bbox, 'bbox_num': bbox_num}
            return output
    def get_loss(self):
        # Training entry point: returns the loss dict from _forward.
        return self._forward()
    def get_pred(self):
        # Inference entry point: returns decoded detections from _forward.
        return self._forward()

View File

@@ -0,0 +1,88 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
__all__ = ['YOLOF']
@register
class YOLOF(BaseArch):
    """
    YOLOF network, see https://arxiv.org/abs/2103.09460

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): DilatedEncoder instance
        head (nn.Layer): YOLOFHead instance
        for_mot (bool): whether return other features for multi-object tracking
            models, default False in pure object detection models.
    """
    __category__ = 'architecture'

    def __init__(self,
                 backbone='ResNet',
                 neck='DilatedEncoder',
                 head='YOLOFHead',
                 for_mot=False):
        super(YOLOF, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.for_mot = for_mot

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # Chain output shapes: backbone -> dilated encoder -> head.
        backbone = create(cfg['backbone'])
        neck = create(cfg['neck'], input_shape=backbone.out_shape)
        head = create(cfg['head'], input_shape=neck.out_shape)
        return {'backbone': backbone, 'neck': neck, "head": head}

    def _forward(self):
        feats = self.neck(self.backbone(self.inputs), self.for_mot)
        if self.training:
            # Head returns the loss dict when given the batch targets.
            return self.head(feats, self.inputs)
        # Decode raw head outputs into boxes at the original image scale.
        raw_outs = self.head(feats)
        boxes, counts = self.head.post_process(raw_outs,
                                               self.inputs['im_shape'],
                                               self.inputs['scale_factor'])
        return {'bbox': boxes, 'bbox_num': counts}

    def get_loss(self):
        """Training entry point; returns the head's loss dict."""
        return self._forward()

    def get_pred(self):
        """Inference entry point; returns decoded detections."""
        return self._forward()

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ppdet.core.workspace import register, create
from .meta_arch import BaseArch
import random
import paddle
import paddle.nn.functional as F
import paddle.distributed as dist
__all__ = ['YOLOX']
@register
class YOLOX(BaseArch):
    """
    YOLOX network, see https://arxiv.org/abs/2107.08430

    Args:
        backbone (nn.Layer): backbone instance
        neck (nn.Layer): neck instance
        head (nn.Layer): head instance
        for_mot (bool): whether used for MOT or not
        input_size (list[int]): initial scale, will be reset by self._preprocess()
        size_stride (int): stride of the size range
        size_range (list[int]): multi-scale range for training
        random_interval (int): interval of iter to change self._input_size
    """
    __category__ = 'architecture'
    def __init__(self,
                 backbone='CSPDarkNet',
                 neck='YOLOCSPPAN',
                 head='YOLOXHead',
                 for_mot=False,
                 input_size=[640, 640],
                 size_stride=32,
                 size_range=[15, 25],
                 random_interval=10):
        super(YOLOX, self).__init__()
        self.backbone = backbone
        self.neck = neck
        self.head = head
        self.for_mot = for_mot
        self.input_size = input_size
        # Current training size as a tensor; mutated by _get_size() during
        # multi-scale training, while self.input_size keeps the base size.
        self._input_size = paddle.to_tensor(input_size)
        self.size_stride = size_stride
        self.size_range = size_range
        self.random_interval = random_interval
        # Iteration counter driving the every-random_interval resize.
        self._step = 0
    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        # backbone
        backbone = create(cfg['backbone'])
        # fpn
        kwargs = {'input_shape': backbone.out_shape}
        neck = create(cfg['neck'], **kwargs)
        # head
        kwargs = {'input_shape': neck.out_shape}
        head = create(cfg['head'], **kwargs)
        return {
            'backbone': backbone,
            'neck': neck,
            "head": head,
        }
    def _forward(self):
        if self.training:
            # Multi-scale training: maybe resize this batch before the net.
            self._preprocess()
        body_feats = self.backbone(self.inputs)
        neck_feats = self.neck(body_feats, self.for_mot)
        if self.training:
            yolox_losses = self.head(neck_feats, self.inputs)
            # Report the current training size alongside the losses.
            yolox_losses.update({'size': self._input_size[0]})
            return yolox_losses
        else:
            head_outs = self.head(neck_feats)
            bbox, bbox_num = self.head.post_process(
                head_outs, self.inputs['im_shape'], self.inputs['scale_factor'])
            return {'bbox': bbox, 'bbox_num': bbox_num}
    def get_loss(self):
        # Training entry point: loss dict from _forward.
        return self._forward()
    def get_pred(self):
        # Inference entry point: decoded detections from _forward.
        return self._forward()
    def _preprocess(self):
        # YOLOX multi-scale training, interpolate resize before inputs of the network.
        self._get_size()
        scale_y = self._input_size[0] / self.input_size[0]
        scale_x = self._input_size[1] / self.input_size[1]
        if scale_x != 1 or scale_y != 1:
            # Resize images and rescale gt boxes consistently (x-coords by
            # scale_x, y-coords by scale_y).
            self.inputs['image'] = F.interpolate(
                self.inputs['image'],
                size=self._input_size,
                mode='bilinear',
                align_corners=False)
            gt_bboxes = self.inputs['gt_bbox']
            for i in range(len(gt_bboxes)):
                if len(gt_bboxes[i]) > 0:
                    gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x
                    gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y
            self.inputs['gt_bbox'] = gt_bboxes
    def _get_size(self):
        # random_interval = 10 as default, every 10 iters to change self._input_size
        image_ratio = self.input_size[1] * 1.0 / self.input_size[0]
        if self._step % self.random_interval == 0:
            # Draw a new height factor; width follows the fixed aspect ratio.
            size_factor = random.randint(*self.size_range)
            size = [
                self.size_stride * size_factor,
                self.size_stride * int(size_factor * image_ratio)
            ]
            self._input_size = paddle.to_tensor(size)
        self._step += 1