Replace document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions

View File

@@ -0,0 +1,58 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import yolo_loss
from . import iou_aware_loss
from . import iou_loss
from . import ssd_loss
from . import fcos_loss
from . import solov2_loss
from . import ctfocal_loss
from . import keypoint_loss
from . import jde_loss
from . import fairmot_loss
from . import gfocal_loss
from . import detr_loss
from . import sparsercnn_loss
from . import focal_loss
from . import smooth_l1_loss
from . import pose3d_loss
from . import probiou_loss
from . import cot_loss
from . import supcontrast
from . import queryinst_loss
from . import clrnet_loss
from . import clrnet_line_iou_loss
from .yolo_loss import *
from .iou_aware_loss import *
from .iou_loss import *
from .ssd_loss import *
from .fcos_loss import *
from .solov2_loss import *
from .ctfocal_loss import *
from .keypoint_loss import *
from .jde_loss import *
from .fairmot_loss import *
from .gfocal_loss import *
from .detr_loss import *
from .sparsercnn_loss import *
from .focal_loss import *
from .smooth_l1_loss import *
from .pose3d_loss import *
from .probiou_loss import *
from .cot_loss import *
from .supcontrast import *
from .queryinst_loss import *
from .clrnet_loss import *
from .clrnet_line_iou_loss import *

View File

@@ -0,0 +1,41 @@
import paddle
def line_iou(pred, target, img_w, length=15, aligned=True):
'''
Calculate the line iou value between predictions and targets
Args:
pred: lane predictions, shape: (num_pred, 72)
target: ground truth, shape: (num_target, 72)
img_w: image width
length: extended radius
aligned: True for iou loss calculation, False for pair-wise ious in assign
'''
    # extend each lane x-coordinate into a 1-D segment of radius `length`
    px1 = pred - length
    px2 = pred + length
    tx1 = target - length
    tx2 = target + length
    if aligned:
        # one-to-one: compare matched prediction/target rows directly
        invalid_mask = target
        ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1)
        union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1)
else:
num_pred = pred.shape[0]
invalid_mask = target.tile([num_pred, 1, 1])
ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum(
px1[:, None, :], tx1[None, ...]))
union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) -
paddle.minimum(px1[:, None, :], tx1[None, ...]))
    # x-coordinates that fall outside the image carry no signal; zero them out
    invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w)
    ovr[invalid_masks] = 0.
    union[invalid_masks] = 0.
    iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
return iou
def liou_loss(pred, target, img_w, length=15):
return (1 - line_iou(pred, target, img_w, length)).mean()
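A minimal usage sketch of the two helpers above (shapes follow the docstring; the random values and the 800-pixel image width are assumptions):

import paddle
pred = paddle.rand([4, 72]) * 800    # 4 predicted lanes, 72 x-coordinates each
target = paddle.rand([4, 72]) * 800  # 4 matched ground-truth lanes
loss = liou_loss(pred, target, img_w=800, length=15)     # scalar: 1 - mean line IoU
ious = line_iou(pred, target, img_w=800, aligned=False)  # [4, 4] pairwise IoUs for assignment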

View File

@@ -0,0 +1,283 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.clrnet_utils import accuracy
from ppdet.modeling.assigners.clrnet_assigner import assign
from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss
__all__ = ['CLRNetLoss']
class SoftmaxFocalLoss(nn.Layer):
def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
super(SoftmaxFocalLoss, self).__init__()
self.gamma = gamma
self.nll = nn.NLLLoss(ignore_index=ignore_lb)
    def forward(self, logits, labels):
        # paddle uses `axis`, not torch's `dim`
        scores = F.softmax(logits, axis=1)
        factor = paddle.pow(1. - scores, self.gamma)
        log_score = F.log_softmax(logits, axis=1)
        log_score = factor * log_score
        loss = self.nll(log_score, labels)
        return loss
def focal_loss(input: paddle.Tensor,
target: paddle.Tensor,
alpha: float,
gamma: float=2.0,
reduction: str='none',
eps: float=1e-8) -> paddle.Tensor:
r"""Function that computes Focal loss.
See :class:`~kornia.losses.FocalLoss` for details.
"""
    if not paddle.is_tensor(input):
        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
            type(input)))
if not len(input.shape) >= 2:
raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
input.shape))
if input.shape[0] != target.shape[0]:
raise ValueError(
'Expected input batch_size ({}) to match target batch_size ({}).'.
format(input.shape[0], target.shape[0]))
n = input.shape[0]
out_size = (n, ) + tuple(input.shape[2:])
if target.shape[1:] != input.shape[2:]:
raise ValueError('Expected target size {}, got {}'.format(out_size,
target.shape))
    if (isinstance(input.place, paddle.CUDAPlace) and
            isinstance(target.place, paddle.CPUPlace)) or (
                isinstance(input.place, paddle.CPUPlace) and
                isinstance(target.place, paddle.CUDAPlace)):
raise ValueError(
"input and target must be in the same device. Got: {} and {}".
format(input.place, target.place))
# compute softmax over the classes axis
input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps
# create the labels one hot tensor
target_one_hot: paddle.Tensor = paddle.to_tensor(
F.one_hot(
target, num_classes=input.shape[1]).cast(input.dtype),
place=input.place)
# compute the actual focal loss
weight = paddle.pow(-input_soft + 1., gamma)
focal = -alpha * weight * paddle.log(input_soft)
loss_tmp = paddle.sum(target_one_hot * focal, axis=1)
if reduction == 'none':
loss = loss_tmp
elif reduction == 'mean':
loss = paddle.mean(loss_tmp)
elif reduction == 'sum':
loss = paddle.sum(loss_tmp)
else:
raise NotImplementedError("Invalid reduction mode: {}".format(
reduction))
return loss
class FocalLoss(nn.Layer):
r"""Criterion that computes Focal loss.
According to [1], the Focal loss is computed as follows:
.. math::
\text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
where:
- :math:`p_t` is the model's estimated probability for each class.
Arguments:
alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
        gamma (float): Focusing parameter :math:`\gamma \geq 0`.
reduction (str, optional): Specifies the reduction to apply to the
output: none | mean | sum. none: no reduction will be applied,
mean: the sum of the output will be divided by the number of elements
in the output, sum: the output will be summed. Default: none.
Shape:
- Input: :math:`(N, C, *)` where C = number of classes.
        - Target: :math:`(N, *)` where each value is
          :math:`0 \leq targets[i] \leq C-1`.
    Examples:
        >>> N = 5  # num_classes
        >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'}
        >>> loss = FocalLoss(**kwargs)
        >>> input = paddle.randn([1, N, 3, 5])
        >>> input.stop_gradient = False
        >>> target = paddle.randint(0, N, [1, 3, 5], dtype='int64')
        >>> output = loss(input, target)
        >>> output.backward()
References:
[1] https://arxiv.org/abs/1708.02002
"""
def __init__(self, alpha: float, gamma: float=2.0,
reduction: str='none') -> None:
super(FocalLoss, self).__init__()
self.alpha: float = alpha
self.gamma: float = gamma
self.reduction: str = reduction
self.eps: float = 1e-6
def forward( # type: ignore
self, input: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:
return focal_loss(input, target, self.alpha, self.gamma, self.reduction,
self.eps)
@register
class CLRNetLoss(nn.Layer):
__shared__ = ['img_w', 'img_h', 'num_classes', 'num_points']
def __init__(self,
cls_loss_weight=2.0,
xyt_loss_weight=0.2,
iou_loss_weight=2.0,
seg_loss_weight=1.0,
refine_layers=3,
num_points=72,
img_w=800,
img_h=320,
num_classes=5,
ignore_label=255,
bg_weight=0.4):
super(CLRNetLoss, self).__init__()
self.cls_loss_weight = cls_loss_weight
self.xyt_loss_weight = xyt_loss_weight
self.iou_loss_weight = iou_loss_weight
self.seg_loss_weight = seg_loss_weight
self.refine_layers = refine_layers
self.img_w = img_w
self.img_h = img_h
self.n_strips = num_points - 1
self.num_classes = num_classes
self.ignore_label = ignore_label
weights = paddle.ones(shape=[self.num_classes])
weights[0] = bg_weight
self.criterion = nn.NLLLoss(
ignore_index=self.ignore_label, weight=weights)
def forward(self, output, batch):
predictions_lists = output['predictions_lists']
targets = batch['lane_line'].clone()
cls_criterion = FocalLoss(alpha=0.25, gamma=2.0)
cls_loss = paddle.to_tensor(0.0)
reg_xytl_loss = paddle.to_tensor(0.0)
iou_loss = paddle.to_tensor(0.0)
cls_acc = []
cls_acc_stage = []
for stage in range(self.refine_layers):
predictions_list = predictions_lists[stage]
for predictions, target in zip(predictions_list, targets):
target = target[target[:, 1] == 1]
if len(target) == 0:
# If there are no targets, all predictions have to be negatives (i.e., 0 confidence)
cls_target = paddle.zeros(
[predictions.shape[0]], dtype='int64')
cls_pred = predictions[:, :2]
cls_loss = cls_loss + cls_criterion(cls_pred,
cls_target).sum()
continue
with paddle.no_grad():
matched_row_inds, matched_col_inds = assign(
predictions, target, self.img_w, self.img_h)
# classification targets
cls_target = paddle.zeros([predictions.shape[0]], dtype='int64')
cls_target[matched_row_inds] = 1
cls_pred = predictions[:, :2]
                # regression targets -> [start_y, start_x, theta, length] (all transformed to absolute values), only on matched pairs
reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6]
reg_yxtl[:, 0] *= self.n_strips
reg_yxtl[:, 1] *= (self.img_w - 1)
reg_yxtl[:, 2] *= 180
reg_yxtl[:, 3] *= self.n_strips
                target_yxtl = target.index_select(matched_col_inds)[..., 2:6].clone()
                # regression targets -> S coordinates (all transformed to absolute values)
                reg_pred = predictions.index_select(matched_row_inds)[..., 6:]
                reg_pred *= (self.img_w - 1)
                reg_targets = target.index_select(matched_col_inds)[..., 6:].clone()
                with paddle.no_grad():
                    # ensure the predicted start indices are valid row indices
                    predictions_starts = paddle.clip(
                        (predictions.index_select(matched_row_inds)[..., 2] *
                         self.n_strips).round().cast("int64"),
                        min=0,
                        max=self.n_strips)
                    target_starts = (
                        target.index_select(matched_col_inds)[..., 2] *
                        self.n_strips).round().cast("int64")
                    # adjust the regressed length by the start offset
                    target_yxtl[:, -1] -= (predictions_starts - target_starts)
# Loss calculation
cls_loss = cls_loss + cls_criterion(
cls_pred, cls_target).sum() / target.shape[0]
target_yxtl[:, 0] *= self.n_strips
target_yxtl[:, 2] *= 180
reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss(
input=reg_yxtl, label=target_yxtl, reduction='none').mean()
iou_loss = iou_loss + liou_loss(
reg_pred, reg_targets, self.img_w, length=15)
cls_accuracy = accuracy(cls_pred, cls_target)
cls_acc_stage.append(cls_accuracy)
cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5))
# extra segmentation loss
seg_loss = self.criterion(
F.log_softmax(
output['seg'], axis=1), batch['seg'].cast('int64'))
cls_loss /= (len(targets) * self.refine_layers)
reg_xytl_loss /= (len(targets) * self.refine_layers)
iou_loss /= (len(targets) * self.refine_layers)
loss = cls_loss * self.cls_loss_weight \
+ reg_xytl_loss * self.xyt_loss_weight \
+ seg_loss * self.seg_loss_weight \
+ iou_loss * self.iou_loss_weight
return_value = {
'loss': loss,
'cls_loss': cls_loss * self.cls_loss_weight,
'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight,
'seg_loss': seg_loss * self.seg_loss_weight,
'iou_loss': iou_loss * self.iou_loss_weight
}
for i in range(self.refine_layers):
if not isinstance(cls_acc[i], paddle.Tensor):
cls_acc[i] = paddle.to_tensor(cls_acc[i])
return_value['stage_{}_acc'.format(i)] = cls_acc[i]
return return_value
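For reference, a small smoke test of the paddle FocalLoss layer defined above, mirroring how CLRNetLoss instantiates it with alpha=0.25 and gamma=2.0 (shapes are illustrative):

import paddle
loss_fn = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean')
logits = paddle.randn([8, 2])       # 8 proposals, 2 classes (background / lane)
labels = paddle.randint(0, 2, [8])  # int64 class ids
loss = loss_fn(logits, labels)      # scalar focal loss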

View File

@@ -0,0 +1,61 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from ppdet.core.workspace import register
__all__ = ['COTLoss']
@register
class COTLoss(nn.Layer):
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
cot_scale=1,
cot_lambda=1):
super(COTLoss, self).__init__()
self.cot_scale = cot_scale
self.cot_lambda = cot_lambda
self.num_classes = num_classes
def forward(self, scores, targets, cot_relation):
cls_name = 'loss_bbox_cls_cot'
loss_bbox = {}
tgt_labels, tgt_bboxes, tgt_gt_inds = targets
tgt_labels = paddle.concat(tgt_labels) if len(
tgt_labels) > 1 else tgt_labels[0]
mask = (tgt_labels < self.num_classes)
valid_inds = paddle.nonzero(tgt_labels >= 0).flatten()
if valid_inds.shape[0] == 0:
loss_bbox[cls_name] = paddle.zeros([1], dtype='float32')
else:
tgt_labels = tgt_labels.cast('int64')
valid_cot_targets = []
for i in range(tgt_labels.shape[0]):
train_label = tgt_labels[i]
if train_label < self.num_classes:
valid_cot_targets.append(cot_relation[train_label])
coco_targets = paddle.to_tensor(valid_cot_targets)
coco_targets.stop_gradient = True
coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale)
loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1))
return loss_bbox

View File

@@ -0,0 +1,68 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from ppdet.core.workspace import register, serializable
__all__ = ['CTFocalLoss']
@register
@serializable
class CTFocalLoss(object):
"""
CTFocalLoss: CornerNet & CenterNet Focal Loss
Args:
loss_weight (float): loss weight
gamma (float): gamma parameter for Focal Loss
"""
def __init__(self, loss_weight=1., gamma=2.0):
self.loss_weight = loss_weight
self.gamma = gamma
def __call__(self, pred, target):
"""
Calculate the loss
Args:
pred (Tensor): heatmap prediction
target (Tensor): target for positive samples
Return:
ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet.
Note that the values in target are in [0, 1] since gaussian is
used to reduce the punishment and we treat [0, 1) as neg example.
"""
fg_map = paddle.cast(target == 1, 'float32')
fg_map.stop_gradient = True
bg_map = paddle.cast(target < 1, 'float32')
bg_map.stop_gradient = True
neg_weights = paddle.pow(1 - target, 4)
pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred,
self.gamma) * fg_map
neg_loss = 0 - paddle.log(1 - pred) * paddle.pow(
pred, self.gamma) * neg_weights * bg_map
pos_loss = paddle.sum(pos_loss)
neg_loss = paddle.sum(neg_loss)
fg_num = paddle.sum(fg_map)
ct_focal_loss = (pos_loss + neg_loss) / (
fg_num + paddle.cast(fg_num == 0, 'float32'))
return ct_focal_loss * self.loss_weight
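A small smoke test for CTFocalLoss (shapes and values are illustrative; `pred` stands in for a post-sigmoid heatmap):

import paddle
loss_fn = CTFocalLoss(loss_weight=1., gamma=2.0)
pred = paddle.uniform([2, 1, 8, 8], min=0.01, max=0.99)  # heatmap after sigmoid
target = paddle.zeros([2, 1, 8, 8])                      # gaussian-splatted target
target[0, 0, 4, 4] = 1.0                                 # one positive center
loss = loss_fn(pred, target)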

View File

@@ -0,0 +1,631 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .iou_loss import GIoULoss
from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
from ..bbox_utils import bbox_iou
__all__ = ['DETRLoss', 'DINOLoss']
@register
class DETRLoss(nn.Layer):
__shared__ = ['num_classes', 'use_focal_loss']
__inject__ = ['matcher']
def __init__(self,
num_classes=80,
matcher='HungarianMatcher',
loss_coeff={
'class': 1,
'bbox': 5,
'giou': 2,
'no_object': 0.1,
'mask': 1,
'dice': 1
},
aux_loss=True,
use_focal_loss=False,
use_vfl=False,
use_uni_match=False,
uni_match_ind=0):
r"""
Args:
num_classes (int): The number of classes.
matcher (HungarianMatcher): It computes an assignment between the targets
and the predictions of the network.
loss_coeff (dict): The coefficient of loss.
aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
use_focal_loss (bool): Use focal loss or not.
"""
super(DETRLoss, self).__init__()
self.num_classes = num_classes
self.matcher = matcher
self.loss_coeff = loss_coeff
self.aux_loss = aux_loss
self.use_focal_loss = use_focal_loss
self.use_vfl = use_vfl
self.use_uni_match = use_uni_match
self.uni_match_ind = uni_match_ind
if not self.use_focal_loss:
self.loss_coeff['class'] = paddle.full([num_classes + 1],
loss_coeff['class'])
self.loss_coeff['class'][-1] = loss_coeff['no_object']
self.giou_loss = GIoULoss()
def _get_loss_class(self,
logits,
gt_class,
match_indices,
bg_index,
num_gts,
postfix="",
iou_score=None,
gt_score=None):
# logits: [b, query, num_classes], gt_class: list[[n, 1]]
name_class = "loss_class" + postfix
target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
bs, num_query_objects = target_label.shape
num_gt = sum(len(a) for a in gt_class)
if num_gt > 0:
index, updates = self._get_index_updates(num_query_objects,
gt_class, match_indices)
target_label = paddle.scatter(
target_label.reshape([-1, 1]), index, updates.astype('int64'))
target_label = target_label.reshape([bs, num_query_objects])
if self.use_focal_loss:
target_label = F.one_hot(target_label,
self.num_classes + 1)[..., :-1]
if iou_score is not None and self.use_vfl:
if gt_score is not None:
target_score = paddle.zeros([bs, num_query_objects])
target_score = paddle.scatter(
target_score.reshape([-1, 1]), index, gt_score)
target_score = target_score.reshape(
[bs, num_query_objects, 1]) * target_label
target_score_iou = paddle.zeros([bs, num_query_objects])
target_score_iou = paddle.scatter(
target_score_iou.reshape([-1, 1]), index, iou_score)
target_score_iou = target_score_iou.reshape(
[bs, num_query_objects, 1]) * target_label
target_score = paddle.multiply(target_score,
target_score_iou)
loss_ = self.loss_coeff[
'class'] * varifocal_loss_with_logits(
logits, target_score, target_label,
num_gts / num_query_objects)
else:
target_score = paddle.zeros([bs, num_query_objects])
if num_gt > 0:
target_score = paddle.scatter(
target_score.reshape([-1, 1]), index, iou_score)
target_score = target_score.reshape(
[bs, num_query_objects, 1]) * target_label
loss_ = self.loss_coeff[
'class'] * varifocal_loss_with_logits(
logits, target_score, target_label,
num_gts / num_query_objects)
else:
loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
logits, target_label, num_gts / num_query_objects)
else:
loss_ = F.cross_entropy(
logits, target_label, weight=self.loss_coeff['class'])
return {name_class: loss_}
def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
postfix=""):
# boxes: [b, query, 4], gt_bbox: list[[n, 4]]
name_bbox = "loss_bbox" + postfix
name_giou = "loss_giou" + postfix
loss = dict()
if sum(len(a) for a in gt_bbox) == 0:
loss[name_bbox] = paddle.to_tensor([0.])
loss[name_giou] = paddle.to_tensor([0.])
return loss
src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
match_indices)
loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
src_bbox, target_bbox, reduction='sum') / num_gts
loss[name_giou] = self.giou_loss(
bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
loss[name_giou] = loss[name_giou].sum() / num_gts
loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
return loss
def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
postfix=""):
# masks: [b, query, h, w], gt_mask: list[[n, H, W]]
name_mask = "loss_mask" + postfix
name_dice = "loss_dice" + postfix
loss = dict()
if sum(len(a) for a in gt_mask) == 0:
loss[name_mask] = paddle.to_tensor([0.])
loss[name_dice] = paddle.to_tensor([0.])
return loss
src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
match_indices)
src_masks = F.interpolate(
src_masks.unsqueeze(0),
size=target_masks.shape[-2:],
mode="bilinear")[0]
loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
src_masks,
target_masks,
paddle.to_tensor(
[num_gts], dtype='float32'))
loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
src_masks, target_masks, num_gts)
return loss
def _dice_loss(self, inputs, targets, num_gts):
inputs = F.sigmoid(inputs)
inputs = inputs.flatten(1)
targets = targets.flatten(1)
numerator = 2 * (inputs * targets).sum(1)
denominator = inputs.sum(-1) + targets.sum(-1)
loss = 1 - (numerator + 1) / (denominator + 1)
return loss.sum() / num_gts
def _get_loss_aux(self,
boxes,
logits,
gt_bbox,
gt_class,
bg_index,
num_gts,
dn_match_indices=None,
postfix="",
masks=None,
gt_mask=None,
gt_score=None):
loss_class = []
loss_bbox, loss_giou = [], []
loss_mask, loss_dice = [], []
if dn_match_indices is not None:
match_indices = dn_match_indices
elif self.use_uni_match:
match_indices = self.matcher(
boxes[self.uni_match_ind],
logits[self.uni_match_ind],
gt_bbox,
gt_class,
masks=masks[self.uni_match_ind] if masks is not None else None,
gt_mask=gt_mask)
for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
aux_masks = masks[i] if masks is not None else None
if not self.use_uni_match and dn_match_indices is None:
match_indices = self.matcher(
aux_boxes,
aux_logits,
gt_bbox,
gt_class,
masks=aux_masks,
gt_mask=gt_mask)
if self.use_vfl:
if sum(len(a) for a in gt_bbox) > 0:
src_bbox, target_bbox = self._get_src_target_assign(
aux_boxes.detach(), gt_bbox, match_indices)
iou_score = bbox_iou(
bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
else:
iou_score = None
if gt_score is not None:
_, target_score = self._get_src_target_assign(
logits[-1].detach(), gt_score, match_indices)
else:
iou_score = None
loss_class.append(
self._get_loss_class(
aux_logits,
gt_class,
match_indices,
bg_index,
num_gts,
postfix,
iou_score,
gt_score=target_score
if gt_score is not None else None)['loss_class' + postfix])
loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
num_gts, postfix)
loss_bbox.append(loss_['loss_bbox' + postfix])
loss_giou.append(loss_['loss_giou' + postfix])
if masks is not None and gt_mask is not None:
loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
num_gts, postfix)
loss_mask.append(loss_['loss_mask' + postfix])
loss_dice.append(loss_['loss_dice' + postfix])
loss = {
"loss_class_aux" + postfix: paddle.add_n(loss_class),
"loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
"loss_giou_aux" + postfix: paddle.add_n(loss_giou)
}
if masks is not None and gt_mask is not None:
loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
return loss
def _get_index_updates(self, num_query_objects, target, match_indices):
batch_idx = paddle.concat([
paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
])
src_idx = paddle.concat([src for (src, _) in match_indices])
src_idx += (batch_idx * num_query_objects)
target_assign = paddle.concat([
paddle.gather(
t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
])
return src_idx, target_assign
def _get_src_target_assign(self, src, target, match_indices):
src_assign = paddle.concat([
paddle.gather(
t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (I, _) in zip(src, match_indices)
])
target_assign = paddle.concat([
paddle.gather(
t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
for t, (_, J) in zip(target, match_indices)
])
return src_assign, target_assign
def _get_num_gts(self, targets, dtype="float32"):
num_gts = sum(len(a) for a in targets)
num_gts = paddle.to_tensor([num_gts], dtype=dtype)
if paddle.distributed.get_world_size() > 1:
paddle.distributed.all_reduce(num_gts)
num_gts /= paddle.distributed.get_world_size()
num_gts = paddle.clip(num_gts, min=1.)
return num_gts
def _get_prediction_loss(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_match_indices=None,
num_gts=1,
gt_score=None):
if dn_match_indices is None:
match_indices = self.matcher(
boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
else:
match_indices = dn_match_indices
if self.use_vfl:
if gt_score is not None: #ssod
_, target_score = self._get_src_target_assign(
logits[-1].detach(), gt_score, match_indices)
elif sum(len(a) for a in gt_bbox) > 0:
src_bbox, target_bbox = self._get_src_target_assign(
boxes.detach(), gt_bbox, match_indices)
iou_score = bbox_iou(
bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))
else:
iou_score = None
else:
iou_score = None
loss = dict()
loss.update(
self._get_loss_class(
logits,
gt_class,
match_indices,
self.num_classes,
num_gts,
postfix,
iou_score,
gt_score=target_score if gt_score is not None else None))
loss.update(
self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
postfix))
if masks is not None and gt_mask is not None:
loss.update(
self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
postfix))
return loss
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
gt_score=None,
**kwargs):
r"""
Args:
boxes (Tensor): [l, b, query, 4]
logits (Tensor): [l, b, query, num_classes]
gt_bbox (List(Tensor)): list[[n, 4]]
gt_class (List(Tensor)): list[[n, 1]]
masks (Tensor, optional): [l, b, query, h, w]
gt_mask (List(Tensor), optional): list[[n, H, W]]
postfix (str): postfix of loss name
"""
dn_match_indices = kwargs.get("dn_match_indices", None)
num_gts = kwargs.get("num_gts", None)
if num_gts is None:
num_gts = self._get_num_gts(gt_class)
total_loss = self._get_prediction_loss(
boxes[-1],
logits[-1],
gt_bbox,
gt_class,
masks=masks[-1] if masks is not None else None,
gt_mask=gt_mask,
postfix=postfix,
dn_match_indices=dn_match_indices,
num_gts=num_gts,
gt_score=gt_score if gt_score is not None else None)
if self.aux_loss:
total_loss.update(
self._get_loss_aux(
boxes[:-1],
logits[:-1],
gt_bbox,
gt_class,
self.num_classes,
num_gts,
dn_match_indices,
postfix,
masks=masks[:-1] if masks is not None else None,
gt_mask=gt_mask,
gt_score=gt_score if gt_score is not None else None))
return total_loss
@register
class DINOLoss(DETRLoss):
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_out_bboxes=None,
dn_out_logits=None,
dn_meta=None,
gt_score=None,
**kwargs):
num_gts = self._get_num_gts(gt_class)
total_loss = super(DINOLoss, self).forward(
boxes,
logits,
gt_bbox,
gt_class,
num_gts=num_gts,
gt_score=gt_score)
if dn_meta is not None:
dn_positive_idx, dn_num_group = \
dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
assert len(gt_class) == len(dn_positive_idx)
# denoising match indices
dn_match_indices = self.get_dn_match_indices(
gt_class, dn_positive_idx, dn_num_group)
# compute denoising training loss
num_gts *= dn_num_group
dn_loss = super(DINOLoss, self).forward(
dn_out_bboxes,
dn_out_logits,
gt_bbox,
gt_class,
postfix="_dn",
dn_match_indices=dn_match_indices,
num_gts=num_gts,
gt_score=gt_score)
total_loss.update(dn_loss)
else:
total_loss.update(
{k + '_dn': paddle.to_tensor([0.])
for k in total_loss.keys()})
return total_loss
@staticmethod
def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
dn_match_indices = []
for i in range(len(labels)):
num_gt = len(labels[i])
if num_gt > 0:
gt_idx = paddle.arange(end=num_gt, dtype="int64")
gt_idx = gt_idx.tile([dn_num_group])
assert len(dn_positive_idx[i]) == len(gt_idx)
dn_match_indices.append((dn_positive_idx[i], gt_idx))
else:
dn_match_indices.append((paddle.zeros(
[0], dtype="int64"), paddle.zeros(
[0], dtype="int64")))
return dn_match_indices
@register
class MaskDINOLoss(DETRLoss):
__shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
__inject__ = ['matcher']
def __init__(self,
num_classes=80,
matcher='HungarianMatcher',
loss_coeff={
'class': 4,
'bbox': 5,
'giou': 2,
'mask': 5,
'dice': 5
},
aux_loss=True,
use_focal_loss=False,
num_sample_points=12544,
oversample_ratio=3.0,
important_sample_ratio=0.75):
super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
aux_loss, use_focal_loss)
assert oversample_ratio >= 1
assert important_sample_ratio <= 1 and important_sample_ratio >= 0
self.num_sample_points = num_sample_points
self.oversample_ratio = oversample_ratio
self.important_sample_ratio = important_sample_ratio
self.num_oversample_points = int(num_sample_points * oversample_ratio)
self.num_important_points = int(num_sample_points *
important_sample_ratio)
self.num_random_points = num_sample_points - self.num_important_points
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None,
postfix="",
dn_out_bboxes=None,
dn_out_logits=None,
dn_out_masks=None,
dn_meta=None,
**kwargs):
num_gts = self._get_num_gts(gt_class)
total_loss = super(MaskDINOLoss, self).forward(
boxes,
logits,
gt_bbox,
gt_class,
masks=masks,
gt_mask=gt_mask,
num_gts=num_gts)
if dn_meta is not None:
dn_positive_idx, dn_num_group = \
dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
assert len(gt_class) == len(dn_positive_idx)
# denoising match indices
dn_match_indices = DINOLoss.get_dn_match_indices(
gt_class, dn_positive_idx, dn_num_group)
# compute denoising training loss
num_gts *= dn_num_group
dn_loss = super(MaskDINOLoss, self).forward(
dn_out_bboxes,
dn_out_logits,
gt_bbox,
gt_class,
masks=dn_out_masks,
gt_mask=gt_mask,
postfix="_dn",
dn_match_indices=dn_match_indices,
num_gts=num_gts)
total_loss.update(dn_loss)
else:
total_loss.update(
{k + '_dn': paddle.to_tensor([0.])
for k in total_loss.keys()})
return total_loss
def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
postfix=""):
# masks: [b, query, h, w], gt_mask: list[[n, H, W]]
name_mask = "loss_mask" + postfix
name_dice = "loss_dice" + postfix
loss = dict()
if sum(len(a) for a in gt_mask) == 0:
loss[name_mask] = paddle.to_tensor([0.])
loss[name_dice] = paddle.to_tensor([0.])
return loss
src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
match_indices)
# sample points
sample_points = self._get_point_coords_by_uncertainty(src_masks)
sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0
src_masks = F.grid_sample(
src_masks.unsqueeze(1), sample_points,
align_corners=False).squeeze([1, 2])
target_masks = F.grid_sample(
target_masks.unsqueeze(1), sample_points,
align_corners=False).squeeze([1, 2]).detach()
loss[name_mask] = self.loss_coeff[
'mask'] * F.binary_cross_entropy_with_logits(
src_masks, target_masks,
reduction='none').mean(1).sum() / num_gts
loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
src_masks, target_masks, num_gts)
return loss
def _get_point_coords_by_uncertainty(self, masks):
# Sample points based on their uncertainty.
masks = masks.detach()
num_masks = masks.shape[0]
sample_points = paddle.rand(
[num_masks, 1, self.num_oversample_points, 2])
out_mask = F.grid_sample(
masks.unsqueeze(1), 2.0 * sample_points - 1.0,
align_corners=False).squeeze([1, 2])
out_mask = -paddle.abs(out_mask)
_, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
if self.num_random_points > 0:
sample_points = paddle.concat(
[
sample_points,
paddle.rand([num_masks, self.num_random_points, 2])
],
axis=1)
return sample_points
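To see how the denoising branch is matched, here is a toy sketch of DINOLoss.get_dn_match_indices (values assumed): with two ground-truth boxes in image 0 and two denoising groups, each GT index is tiled once per group.

import paddle
labels = [paddle.zeros([2, 1], dtype='int64'),  # image 0: 2 GT boxes
          paddle.zeros([0, 1], dtype='int64')]  # image 1: none
dn_positive_idx = [paddle.to_tensor([0, 1, 2, 3]),
                   paddle.zeros([0], dtype='int64')]
inds = DINOLoss.get_dn_match_indices(labels, dn_positive_idx, dn_num_group=2)
# image 0 matches gt_idx [0, 1, 0, 1]; image 1 gets an empty index pair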

View File

@@ -0,0 +1,41 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle.nn.initializer import Constant
from ppdet.core.workspace import register
__all__ = ['FairMOTLoss']
@register
class FairMOTLoss(nn.Layer):
def __init__(self):
super(FairMOTLoss, self).__init__()
self.det_weight = self.create_parameter(
shape=[1], default_initializer=Constant(-1.85))
self.reid_weight = self.create_parameter(
shape=[1], default_initializer=Constant(-1.05))
def forward(self, det_loss, reid_loss):
loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp(
-self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight
)
loss *= 0.5
return {'loss': loss}
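FairMOTLoss combines the two task losses with learned uncertainty weights in the style of Kendall et al., loss = 0.5 * (exp(-w_det) * L_det + exp(-w_reid) * L_reid + w_det + w_reid). A minimal sketch (toy values):

import paddle
loss_fn = FairMOTLoss()
det_loss = paddle.to_tensor(1.2)    # detection branch loss
reid_loss = paddle.to_tensor(3.4)   # re-id branch loss
out = loss_fn(det_loss, reid_loss)  # {'loss': ...}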

File diff suppressed because it is too large

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from ppdet.core.workspace import register
__all__ = ['FocalLoss', 'Weighted_FocalLoss']
@register
class FocalLoss(nn.Layer):
"""A wrapper around paddle.nn.functional.sigmoid_focal_loss.
Args:
use_sigmoid (bool): currently only support use_sigmoid=True
alpha (float): parameter alpha in Focal Loss
gamma (float): parameter gamma in Focal Loss
loss_weight (float): final loss will be multiplied by this
"""
def __init__(self,
use_sigmoid=True,
alpha=0.25,
gamma=2.0,
loss_weight=1.0):
super(FocalLoss, self).__init__()
assert use_sigmoid == True, \
'Focal Loss only supports sigmoid at the moment'
self.use_sigmoid = use_sigmoid
self.alpha = alpha
self.gamma = gamma
self.loss_weight = loss_weight
def forward(self, pred, target, reduction='none'):
"""forward function.
Args:
pred (Tensor): logits of class prediction, of shape (N, num_classes)
target (Tensor): target class label, of shape (N, )
reduction (str): the way to reduce loss, one of (none, sum, mean)
"""
num_classes = pred.shape[1]
target = F.one_hot(target, num_classes+1).cast(pred.dtype)
target = target[:, :-1].detach()
loss = F.sigmoid_focal_loss(
pred, target, alpha=self.alpha, gamma=self.gamma,
reduction=reduction)
return loss * self.loss_weight
@register
class Weighted_FocalLoss(FocalLoss):
"""A wrapper around paddle.nn.functional.sigmoid_focal_loss.
Args:
use_sigmoid (bool): currently only support use_sigmoid=True
alpha (float): parameter alpha in Focal Loss
gamma (float): parameter gamma in Focal Loss
loss_weight (float): final loss will be multiplied by this
"""
def __init__(self,
use_sigmoid=True,
alpha=0.25,
gamma=2.0,
loss_weight=1.0,
reduction="mean"):
super(FocalLoss, self).__init__()
assert use_sigmoid == True, \
'Focal Loss only supports sigmoid at the moment'
self.use_sigmoid = use_sigmoid
self.alpha = alpha
self.gamma = gamma
self.loss_weight = loss_weight
self.reduction = reduction
def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
"""forward function.
Args:
pred (Tensor): logits of class prediction, of shape (N, num_classes)
target (Tensor): target class label, of shape (N, )
reduction (str): the way to reduce loss, one of (none, sum, mean)
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
num_classes = pred.shape[1]
target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
target = target[:, :-1].detach()
loss = F.sigmoid_focal_loss(
pred, target, alpha=self.alpha, gamma=self.gamma,
reduction='none')
if weight is not None:
if weight.shape != loss.shape:
if weight.shape[0] == loss.shape[0]:
# For most cases, weight is of shape (num_priors, ),
# which means it does not have the second axis num_class
weight = weight.reshape((-1, 1))
else:
# Sometimes, weight per anchor per class is also needed. e.g.
# in FSAF. But it may be flattened of shape
# (num_priors x num_class, ), while loss is still of shape
# (num_priors, num_class).
assert weight.numel() == loss.numel()
weight = weight.reshape((loss.shape[0], -1))
assert weight.ndim == loss.ndim
loss = loss * weight
# if avg_factor is not specified, just reduce the loss
if avg_factor is None:
if reduction == 'mean':
loss = loss.mean()
elif reduction == 'sum':
loss = loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if reduction == 'mean':
# Avoid causing ZeroDivisionError when avg_factor is 0.0,
# i.e., all labels of an image belong to ignore index.
eps = 1e-10
loss = loss.sum() / (avg_factor + eps)
# if reduction is 'none', then do nothing, otherwise raise an error
elif reduction != 'none':
raise ValueError('avg_factor can not be used with reduction="sum"')
return loss * self.loss_weight
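A minimal usage sketch for the sigmoid FocalLoss wrapper registered above (shapes assumed; the label value `num_classes` denotes background and is dropped by the one-hot slice):

import paddle
loss_fn = FocalLoss(alpha=0.25, gamma=2.0, loss_weight=1.0)
pred = paddle.randn([16, 80])         # class logits
target = paddle.randint(0, 81, [16])  # 80 == background
loss = loss_fn(pred, target, reduction='mean')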

View File

@@ -0,0 +1,217 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling import ops
__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
"""
Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
Qualified and Distributed Bounding Boxes for Dense Object Detection
<https://arxiv.org/abs/2006.04388>`_.
Args:
pred (Tensor): Predicted joint representation of classification
and quality (IoU) estimation with shape (N, C), C is the number of
classes.
target (tuple([Tensor])): Target category label with shape (N,)
and target quality label with shape (N,).
beta (float): The beta parameter for calculating the modulating factor.
Defaults to 2.0.
Returns:
Tensor: Loss tensor with shape (N,).
"""
assert len(target) == 2, """target for QFL must be a tuple of two elements,
including category label and quality label, respectively"""
# label denotes the category id, score denotes the quality score
label, score = target
if use_sigmoid:
func = F.binary_cross_entropy_with_logits
else:
func = F.binary_cross_entropy
# negatives are supervised by 0 quality score
pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred
scale_factor = pred_sigmoid
zerolabel = paddle.zeros(pred.shape, dtype='float32')
loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta)
# FG cat_id: [0, num_classes -1], BG cat_id: num_classes
bg_class_ind = pred.shape[1]
pos = paddle.logical_and((label >= 0),
(label < bg_class_ind)).nonzero().squeeze(1)
if pos.shape[0] == 0:
return loss.sum(axis=1)
pos_label = paddle.gather(label, pos, axis=0)
pos_mask = np.zeros(pred.shape, dtype=np.int32)
pos_mask[pos.numpy(), pos_label.numpy()] = 1
pos_mask = paddle.to_tensor(pos_mask, dtype='bool')
score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')
# positives are supervised by bbox quality (IoU) score
scale_factor_new = score - pred_sigmoid
loss_pos = func(
pred, score, reduction='none') * scale_factor_new.abs().pow(beta)
loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
loss = loss.sum(axis=1)
return loss
def distribution_focal_loss(pred, label):
"""Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning
Qualified and Distributed Bounding Boxes for Dense Object Detection
<https://arxiv.org/abs/2006.04388>`_.
Args:
pred (Tensor): Predicted general distribution of bounding boxes
(before softmax) with shape (N, n+1), n is the max value of the
integral set `{0, ..., n}` in paper.
label (Tensor): Target distance label for bounding boxes with
shape (N,).
Returns:
Tensor: Loss tensor with shape (N,).
"""
dis_left = label.cast('int64')
dis_right = dis_left + 1
weight_left = dis_right.cast('float32') - label
weight_right = label - dis_left.cast('float32')
loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \
+ F.cross_entropy(pred, dis_right, reduction='none') * weight_right
return loss
@register
@serializable
class QualityFocalLoss(nn.Layer):
r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:
Learning Qualified and Distributed Bounding Boxes for Dense Object
Detection <https://arxiv.org/abs/2006.04388>`_.
Args:
use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
Defaults to True.
beta (float): The beta parameter for calculating the modulating factor.
Defaults to 2.0.
reduction (str): Options are "none", "mean" and "sum".
loss_weight (float): Loss weight of current loss.
"""
def __init__(self,
use_sigmoid=True,
beta=2.0,
reduction='mean',
loss_weight=1.0):
super(QualityFocalLoss, self).__init__()
self.use_sigmoid = use_sigmoid
self.beta = beta
assert reduction in ('none', 'mean', 'sum')
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self, pred, target, weight=None, avg_factor=None):
"""Forward function.
Args:
pred (Tensor): Predicted joint representation of
classification and quality (IoU) estimation with shape (N, C),
C is the number of classes.
target (tuple([Tensor])): Target category label with shape
(N,) and target quality label with shape (N,).
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
"""
loss = self.loss_weight * quality_focal_loss(
pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)
if weight is not None:
loss = loss * weight
if avg_factor is None:
if self.reduction == 'none':
return loss
elif self.reduction == 'mean':
return loss.mean()
elif self.reduction == 'sum':
return loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if self.reduction == 'mean':
loss = loss.sum() / avg_factor
# if reduction is 'none', then do nothing, otherwise raise an error
elif self.reduction != 'none':
raise ValueError(
'avg_factor can not be used with reduction="sum"')
return loss
@register
@serializable
class DistributionFocalLoss(nn.Layer):
"""Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:
Learning Qualified and Distributed Bounding Boxes for Dense Object
Detection <https://arxiv.org/abs/2006.04388>`_.
Args:
reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
loss_weight (float): Loss weight of current loss.
"""
def __init__(self, reduction='mean', loss_weight=1.0):
super(DistributionFocalLoss, self).__init__()
assert reduction in ('none', 'mean', 'sum')
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self, pred, target, weight=None, avg_factor=None):
"""Forward function.
Args:
pred (Tensor): Predicted general distribution of bounding
boxes (before softmax) with shape (N, n+1), n is the max value
of the integral set `{0, ..., n}` in paper.
target (Tensor): Target distance label for bounding boxes
with shape (N,).
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
"""
loss = self.loss_weight * distribution_focal_loss(pred, target)
if weight is not None:
loss = loss * weight
if avg_factor is None:
if self.reduction == 'none':
return loss
elif self.reduction == 'mean':
return loss.mean()
elif self.reduction == 'sum':
return loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if self.reduction == 'mean':
loss = loss.sum() / avg_factor
# if reduction is 'none', then do nothing, otherwise raise an error
elif self.reduction != 'none':
raise ValueError(
'avg_factor can not be used with reduction="sum"')
return loss
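A worked example of distribution_focal_loss (toy values): a continuous distance label is split between its two neighbouring integer bins in proportion to proximity, so a label of 2.4 yields 0.6 * CE(pred, 2) + 0.4 * CE(pred, 3).

import paddle
pred = paddle.randn([1, 8])      # distribution logits over the integral set {0, ..., 7}
label = paddle.to_tensor([2.4])  # sub-pixel distance target
loss = distribution_focal_loss(pred, label)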

View File

@@ -0,0 +1,47 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from .iou_loss import IouLoss
from ..bbox_utils import bbox_iou
@register
@serializable
class IouAwareLoss(IouLoss):
"""
iou aware loss, see https://arxiv.org/abs/1912.05992
Args:
loss_weight (float): iou aware loss weight, default is 1.0
max_height (int): max height of input to support random shape input
max_width (int): max width of input to support random shape input
"""
def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False):
super(IouAwareLoss, self).__init__(
loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou)
def __call__(self, ioup, pbox, gbox):
iou = bbox_iou(
pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
iou.stop_gradient = True
loss_iou_aware = F.binary_cross_entropy_with_logits(
ioup, iou, reduction='none')
loss_iou_aware = loss_iou_aware * self.loss_weight
return loss_iou_aware

View File

@@ -0,0 +1,295 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import math
import paddle
from ppdet.core.workspace import register, serializable
from ..bbox_utils import bbox_iou
__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss']
@register
@serializable
class IouLoss(object):
"""
iou loss, see https://arxiv.org/abs/1908.03851
loss = 1.0 - iou * iou
Args:
loss_weight (float): iou loss weight, default is 2.5
max_height (int): max height of input to support random shape input
max_width (int): max width of input to support random shape input
ciou_term (bool): whether to add ciou_term
loss_square (bool): whether to square the iou term
"""
def __init__(self,
loss_weight=2.5,
giou=False,
diou=False,
ciou=False,
loss_square=True):
self.loss_weight = loss_weight
self.giou = giou
self.diou = diou
self.ciou = ciou
self.loss_square = loss_square
def __call__(self, pbox, gbox):
iou = bbox_iou(
pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
if self.loss_square:
loss_iou = 1 - iou * iou
else:
loss_iou = 1 - iou
loss_iou = loss_iou * self.loss_weight
return loss_iou
@register
@serializable
class GIoULoss(object):
"""
Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630
Args:
loss_weight (float): giou loss weight, default as 1
eps (float): epsilon to avoid divide by zero, default as 1e-10
reduction (string): Options are "none", "mean" and "sum". default as none
"""
def __init__(self, loss_weight=1., eps=1e-10, reduction='none'):
self.loss_weight = loss_weight
self.eps = eps
assert reduction in ('none', 'mean', 'sum')
self.reduction = reduction
def bbox_overlap(self, box1, box2, eps=1e-10):
"""calculate the iou of box1 and box2
Args:
box1 (Tensor): box1 with the shape (..., 4)
box2 (Tensor): box1 with the shape (..., 4)
eps (float): epsilon to avoid divide by zero
Return:
iou (Tensor): iou of box1 and box2
overlap (Tensor): overlap of box1 and box2
union (Tensor): union of box1 and box2
"""
x1, y1, x2, y2 = box1
x1g, y1g, x2g, y2g = box2
xkis1 = paddle.maximum(x1, x1g)
ykis1 = paddle.maximum(y1, y1g)
xkis2 = paddle.minimum(x2, x2g)
ykis2 = paddle.minimum(y2, y2g)
w_inter = (xkis2 - xkis1).clip(0)
h_inter = (ykis2 - ykis1).clip(0)
overlap = w_inter * h_inter
area1 = (x2 - x1) * (y2 - y1)
area2 = (x2g - x1g) * (y2g - y1g)
union = area1 + area2 - overlap + eps
iou = overlap / union
return iou, overlap, union
def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None):
x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
box1 = [x1, y1, x2, y2]
box2 = [x1g, y1g, x2g, y2g]
iou, overlap, union = self.bbox_overlap(box1, box2, self.eps)
xc1 = paddle.minimum(x1, x1g)
yc1 = paddle.minimum(y1, y1g)
xc2 = paddle.maximum(x2, x2g)
yc2 = paddle.maximum(y2, y2g)
area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps
miou = iou - ((area_c - union) / area_c)
if loc_reweight is not None:
loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1))
loc_thresh = 0.9
giou = 1 - (1 - loc_thresh
) * miou - loc_thresh * miou * loc_reweight
else:
giou = 1 - miou
if self.reduction == 'none':
loss = giou
elif self.reduction == 'sum':
loss = paddle.sum(giou * iou_weight)
else:
loss = paddle.mean(giou * iou_weight)
return loss * self.loss_weight
@register
@serializable
class DIouLoss(GIoULoss):
"""
Distance-IoU Loss, see https://arxiv.org/abs/1911.08287
Args:
loss_weight (float): giou loss weight, default as 1
eps (float): epsilon to avoid divide by zero, default as 1e-10
use_complete_iou_loss (bool): whether to use complete iou loss
"""
def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):
super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)
self.use_complete_iou_loss = use_complete_iou_loss
def __call__(self, pbox, gbox, iou_weight=1.):
x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
cx = (x1 + x2) / 2
cy = (y1 + y2) / 2
w = x2 - x1
h = y2 - y1
cxg = (x1g + x2g) / 2
cyg = (y1g + y2g) / 2
wg = x2g - x1g
hg = y2g - y1g
x2 = paddle.maximum(x1, x2)
y2 = paddle.maximum(y1, y2)
# A and B
xkis1 = paddle.maximum(x1, x1g)
ykis1 = paddle.maximum(y1, y1g)
xkis2 = paddle.minimum(x2, x2g)
ykis2 = paddle.minimum(y2, y2g)
# A or B
xc1 = paddle.minimum(x1, x1g)
yc1 = paddle.minimum(y1, y1g)
xc2 = paddle.maximum(x2, x2g)
yc2 = paddle.maximum(y2, y2g)
        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
        # cast the validity masks to float before multiplying to avoid
        # float-by-bool dtype errors
        intsctk = intsctk * paddle.cast(
            xkis2 > xkis1, 'float32') * paddle.cast(ykis2 > ykis1, 'float32')
unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
) - intsctk + self.eps
iouk = intsctk / unionk
# DIOU term
dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)
dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)
diou_term = (dist_intersection + self.eps) / (dist_union + self.eps)
# CIOU term
ciou_term = 0
if self.use_complete_iou_loss:
ar_gt = wg / hg
ar_pred = w / h
arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)
ar_loss = 4. / np.pi / np.pi * arctan * arctan
alpha = ar_loss / (1 - iouk + ar_loss + self.eps)
alpha.stop_gradient = True
ciou_term = alpha * ar_loss
diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight)
return diou * self.loss_weight
@register
@serializable
class SIoULoss(GIoULoss):
"""
see https://arxiv.org/pdf/2205.12740.pdf
Args:
loss_weight (float): siou loss weight, default as 1
eps (float): epsilon to avoid divide by zero, default as 1e-10
theta (float): default as 4
reduction (str): Options are "none", "mean" and "sum". default as none
"""
def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'):
super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps)
self.loss_weight = loss_weight
self.eps = eps
self.theta = theta
self.reduction = reduction
def __call__(self, pbox, gbox):
x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
box1 = [x1, y1, x2, y2]
box2 = [x1g, y1g, x2g, y2g]
iou = bbox_iou(box1, box2)
cx = (x1 + x2) / 2
cy = (y1 + y2) / 2
w = x2 - x1 + self.eps
h = y2 - y1 + self.eps
cxg = (x1g + x2g) / 2
cyg = (y1g + y2g) / 2
wg = x2g - x1g + self.eps
hg = y2g - y1g + self.eps
x2 = paddle.maximum(x1, x2)
y2 = paddle.maximum(y1, y2)
# A or B
xc1 = paddle.minimum(x1, x1g)
yc1 = paddle.minimum(y1, y1g)
xc2 = paddle.maximum(x2, x2g)
yc2 = paddle.maximum(y2, y2g)
cw_out = xc2 - xc1
ch_out = yc2 - yc1
ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg)
cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg)
        # angle cost (eps guards against zero center distance)
        dist_intersection = paddle.sqrt((cx - cxg)**2 +
                                        (cy - cyg)**2) + self.eps
        sin_angle_alpha = ch / dist_intersection
        sin_angle_beta = cw / dist_intersection
thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2
thred.stop_gradient = True
sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta,
sin_angle_alpha)
angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2)
# distance cost
gamma = 2 - angle_cost
# gamma.stop_gradient = True
beta_x = ((cxg - cx) / cw_out)**2
beta_y = ((cyg - cy) / ch_out)**2
dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma *
beta_y)
# shape cost
omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)
omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)
omega = (1 - paddle.exp(-omega_w))**self.theta + (
1 - paddle.exp(-omega_h))**self.theta
siou_loss = 1 - iou + (omega + dist_cost) / 2
if self.reduction == 'mean':
siou_loss = paddle.mean(siou_loss)
elif self.reduction == 'sum':
siou_loss = paddle.sum(siou_loss)
return siou_loss * self.loss_weight
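A minimal usage sketch for GIoULoss on xyxy boxes (toy values):

import paddle
giou_loss = GIoULoss(loss_weight=1., reduction='mean')
pbox = paddle.to_tensor([[10., 10., 50., 50.]])  # predicted box
gbox = paddle.to_tensor([[12., 12., 48., 52.]])  # ground-truth box
loss = giou_loss(pbox, gbox)  # 1 - GIoU, averaged over boxes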

View File

@@ -0,0 +1,193 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss']
@register
class JDEDetectionLoss(nn.Layer):
__shared__ = ['num_classes']
def __init__(self, num_classes=1, for_mot=True):
super(JDEDetectionLoss, self).__init__()
self.num_classes = num_classes
self.for_mot = for_mot
def det_loss(self, p_det, anchor, t_conf, t_box):
pshape = paddle.shape(p_det)
pshape.stop_gradient = True
nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1]
nA = len(anchor)
p_det = paddle.reshape(
p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose(
(0, 1, 3, 4, 2))
# 1. loss_conf: cross_entropy
p_conf = p_det[:, :, :, :, 4:6]
p_conf_flatten = paddle.reshape(p_conf, [-1, 2])
t_conf_flatten = t_conf.flatten()
t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64")
t_conf_flatten.stop_gradient = True
loss_conf = F.cross_entropy(
p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean')
loss_conf.stop_gradient = False
# 2. loss_box: smooth_l1_loss
p_box = p_det[:, :, :, :, :4]
p_box_flatten = paddle.reshape(p_box, [-1, 4])
t_box_flatten = paddle.reshape(t_box, [-1, 4])
fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten()
if fg_inds.numel() > 0:
reg_delta = paddle.gather(p_box_flatten, fg_inds)
reg_target = paddle.gather(t_box_flatten, fg_inds)
else:
reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32')
reg_delta.stop_gradient = False
reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32')
reg_target.stop_gradient = True
loss_box = F.smooth_l1_loss(
reg_delta, reg_target, reduction='mean', delta=1.0)
loss_box.stop_gradient = False
return loss_conf, loss_box
def forward(self, det_outs, targets, anchors):
"""
Args:
det_outs (list[Tensor]): output from detection head, each one
is a 4-D Tensor with shape [N, C, H, W].
targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image',
'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of
each FPN level.
anchors (list[list]): anchor setting of JDE model, N row M col, N is
the anchor levels(FPN levels), M is the anchor scales each
level.
"""
assert len(det_outs) == len(anchors)
loss_confs = []
loss_boxes = []
for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)):
t_conf = targets['tconf{}'.format(i)]
t_box = targets['tbox{}'.format(i)]
loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box)
loss_confs.append(loss_conf)
loss_boxes.append(loss_box)
if self.for_mot:
return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes}
else:
jde_conf_losses = sum(loss_confs)
jde_box_losses = sum(loss_boxes)
jde_det_losses = {
"loss_conf": jde_conf_losses,
"loss_box": jde_box_losses,
"loss": jde_conf_losses + jde_box_losses,
}
return jde_det_losses
@register
class JDEEmbeddingLoss(nn.Layer):
def __init__(self, ):
super(JDEEmbeddingLoss, self).__init__()
self.phony = self.create_parameter(shape=[1], dtype="float32")
def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier):
emb_dim = p_ide.shape[1]
p_ide = p_ide.transpose((0, 2, 3, 1))
p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim])
mask = t_conf > 0
mask = paddle.cast(mask, dtype="int64")
mask.stop_gradient = True
emb_mask = mask.max(1).flatten()
emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()
emb_mask_inds.stop_gradient = True
        # use max(1) to decide the id, TODO: a more reasonable strategy
t_ide_flatten = t_ide.max(1).flatten()
t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64")
valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten()
if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0:
            # loss_ide = paddle.to_tensor([0])  # would raise an error in the backward pass
            loss_ide = self.phony * 0  # zero-valued loss that keeps the graph connected
else:
embedding = paddle.gather(p_ide_flatten, emb_mask_inds)
embedding = emb_scale * F.normalize(embedding)
logits = classifier(embedding)
ide_target = paddle.gather(t_ide_flatten, emb_mask_inds)
loss_ide = F.cross_entropy(
logits, ide_target, ignore_index=-1, reduction='mean')
loss_ide.stop_gradient = False
return loss_ide
def forward(self, ide_outs, targets, emb_scale, classifier):
loss_ides = []
for i, p_ide in enumerate(ide_outs):
t_conf = targets['tconf{}'.format(i)]
t_ide = targets['tide{}'.format(i)]
loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale,
classifier)
loss_ides.append(loss_ide)
return loss_ides
@register
class JDELoss(nn.Layer):
def __init__(self):
super(JDELoss, self).__init__()
def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls,
loss_params_reg, loss_params_ide, targets):
assert len(loss_confs) == len(loss_boxes) == len(loss_ides)
assert len(loss_params_cls) == len(loss_params_reg) == len(
loss_params_ide)
assert len(loss_confs) == len(loss_params_cls)
batchsize = targets['gt_bbox'].shape[0]
nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[
0] / batchsize
nTargets = paddle.to_tensor(nTargets, dtype='float32')
nTargets.stop_gradient = True
jde_losses = []
for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p,
l_ide_p) in enumerate(
zip(loss_confs, loss_boxes, loss_ides, loss_params_cls,
loss_params_reg, loss_params_ide)):
jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p(
loss_ide)
jde_losses.append(jde_loss)
loss_all = {
"loss_conf": sum(loss_confs),
"loss_box": sum(loss_boxes),
"loss_ide": sum(loss_ides),
"loss": sum(jde_losses),
"nTargets": nTargets,
}
return loss_all
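For reference, JDEDetectionLoss.forward looks up per-FPN-level targets by the keys 'tconf{i}' and 'tbox{i}'. A hedged sketch of that layout; the grid sizes and anchor count below are illustrative, not the model's actual strides:

import paddle

nB, nA = 2, 4  # hypothetical batch size and anchors per level
targets = {}
for i, (nGh, nGw) in enumerate([(19, 34), (38, 68), (76, 136)]):
    targets['tconf{}'.format(i)] = paddle.zeros([nB, nA, nGh, nGw], dtype='float32')
    targets['tbox{}'.format(i)] = paddle.zeros([nB, nA, nGh, nGw, 4], dtype='float32')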

View File

@@ -0,0 +1,632 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import cycle, islice
from collections import abc
import numpy as np
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
__all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss']
@register
@serializable
class KeyPointMSELoss(nn.Layer):
def __init__(self, use_target_weight=True, loss_scale=0.5):
"""
KeyPointMSELoss layer
Args:
            use_target_weight (bool): whether to use target weight
            loss_scale (float): scale factor applied to the loss, 0.5 by default
"""
super(KeyPointMSELoss, self).__init__()
self.criterion = nn.MSELoss(reduction='mean')
self.use_target_weight = use_target_weight
self.loss_scale = loss_scale
def forward(self, output, records):
target = records['target']
target_weight = records['target_weight']
batch_size = output.shape[0]
num_joints = output.shape[1]
heatmaps_pred = output.reshape(
(batch_size, num_joints, -1)).split(num_joints, 1)
heatmaps_gt = target.reshape(
(batch_size, num_joints, -1)).split(num_joints, 1)
loss = 0
for idx in range(num_joints):
heatmap_pred = heatmaps_pred[idx].squeeze()
heatmap_gt = heatmaps_gt[idx].squeeze()
if self.use_target_weight:
loss += self.loss_scale * self.criterion(
heatmap_pred.multiply(target_weight[:, idx]),
heatmap_gt.multiply(target_weight[:, idx]))
else:
loss += self.loss_scale * self.criterion(heatmap_pred,
heatmap_gt)
keypoint_losses = dict()
keypoint_losses['loss'] = loss / num_joints
return keypoint_losses
@register
@serializable
class HrHRNetLoss(nn.Layer):
def __init__(self, num_joints, swahr):
"""
HrHRNetLoss layer
Args:
            num_joints (int): number of keypoints
            swahr (bool): whether to use the SWAHR (scale-adaptive heatmap
                regression) variant of the heatmap loss
"""
super(HrHRNetLoss, self).__init__()
if swahr:
self.heatmaploss = HeatMapSWAHRLoss(num_joints)
else:
self.heatmaploss = HeatMapLoss()
self.aeloss = AELoss()
self.ziploss = ZipLoss(
[self.heatmaploss, self.heatmaploss, self.aeloss])
def forward(self, inputs, records):
targets = []
targets.append([records['heatmap_gt1x'], records['mask_1x']])
targets.append([records['heatmap_gt2x'], records['mask_2x']])
targets.append(records['tagmap'])
keypoint_losses = dict()
loss = self.ziploss(inputs, targets)
keypoint_losses['heatmap_loss'] = loss[0] + loss[1]
keypoint_losses['pull_loss'] = loss[2][0]
keypoint_losses['push_loss'] = loss[2][1]
keypoint_losses['loss'] = recursive_sum(loss)
return keypoint_losses
class HeatMapLoss(object):
def __init__(self, loss_factor=1.0):
super(HeatMapLoss, self).__init__()
self.loss_factor = loss_factor
def __call__(self, preds, targets):
heatmap, mask = targets
loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1))
loss = paddle.clip(loss, min=0, max=2).mean()
loss *= self.loss_factor
return loss
class HeatMapSWAHRLoss(object):
def __init__(self, num_joints, loss_factor=1.0):
super(HeatMapSWAHRLoss, self).__init__()
self.loss_factor = loss_factor
self.num_joints = num_joints
def __call__(self, preds, targets):
heatmaps_gt, mask = targets
heatmaps_pred = preds[0]
scalemaps_pred = preds[1]
heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * (
1 + (1 +
(scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2),
heatmaps_gt)
regularizer_loss = paddle.mean(
paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float),
2))
        omega = 0.01
        # thres = 2**(-1/omega), threshold for positive weight
        hm_weight = (heatmaps_scaled_gt**omega * paddle.abs(1 - heatmaps_pred) +
                     paddle.abs(heatmaps_pred) * (1 - heatmaps_scaled_gt**omega))
loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) *
mask.cast('float').unsqueeze(1)) * hm_weight
loss = loss.mean()
loss = self.loss_factor * (loss + 1.0 * regularizer_loss)
return loss
class AELoss(object):
def __init__(self, pull_factor=0.001, push_factor=0.001):
super(AELoss, self).__init__()
self.pull_factor = pull_factor
self.push_factor = push_factor
def apply_single(self, pred, tagmap):
if tagmap.numpy()[:, :, 3].sum() == 0:
return (paddle.zeros([1]), paddle.zeros([1]))
nonzero = paddle.nonzero(tagmap[:, :, 3] > 0)
if nonzero.shape[0] == 0:
return (paddle.zeros([1]), paddle.zeros([1]))
p_inds = paddle.unique(nonzero[:, 0])
num_person = p_inds.shape[0]
if num_person == 0:
return (paddle.zeros([1]), paddle.zeros([1]))
pull = 0
tagpull_num = 0
embs_all = []
person_unvalid = 0
for person_idx in p_inds.numpy():
valid_single = tagmap[person_idx.item()]
validkpts = paddle.nonzero(valid_single[:, 3] > 0)
valid_single = paddle.index_select(valid_single, validkpts)
emb = paddle.gather_nd(pred, valid_single[:, :3])
if emb.shape[0] == 1:
person_unvalid += 1
mean = paddle.mean(emb, axis=0)
embs_all.append(mean)
pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0)
tagpull_num += emb.shape[0]
pull /= max(num_person - person_unvalid, 1)
if num_person < 2:
return pull, paddle.zeros([1])
embs_all = paddle.stack(embs_all)
A = embs_all.expand([num_person, num_person])
B = A.transpose([1, 0])
diff = A - B
diff = paddle.pow(diff, 2)
push = paddle.exp(-diff)
push = paddle.sum(push) - num_person
push /= 2 * num_person * (num_person - 1)
return pull, push
def __call__(self, preds, tagmaps):
bs = preds.shape[0]
losses = [
self.apply_single(preds[i:i + 1].squeeze(),
tagmaps[i:i + 1].squeeze()) for i in range(bs)
]
pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses)
push = self.push_factor * sum(loss[1] for loss in losses) / len(losses)
return pull, push
class ZipLoss(object):
def __init__(self, loss_funcs):
super(ZipLoss, self).__init__()
self.loss_funcs = loss_funcs
def __call__(self, inputs, targets):
assert len(self.loss_funcs) == len(targets) >= len(inputs)
def zip_repeat(*args):
longest = max(map(len, args))
filled = [islice(cycle(x), longest) for x in args]
return zip(*filled)
return tuple(
fn(x, y)
for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs))
def recursive_sum(inputs):
if isinstance(inputs, abc.Sequence):
return sum([recursive_sum(x) for x in inputs])
return inputs
def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas):
if not kpt_gts.astype('bool').any():
return kpt_preds.sum()*0
sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype)
variances = (sigmas * 2)**2
assert kpt_preds.shape[0] == kpt_gts.shape[0]
kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2))
kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2))
squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \
(kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2
assert (kpt_valids.sum(-1) > 0).all()
squared_distance0 = squared_distance / (
kpt_areas[:, None] * variances[None, :] * 2)
squared_distance1 = paddle.exp(-squared_distance0)
squared_distance1 = squared_distance1 * kpt_valids
oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1)
return oks
def oks_loss(pred,
target,
weight,
valid=None,
area=None,
linear=False,
sigmas=None,
eps=1e-6,
avg_factor=None,
reduction=None):
"""Oks loss.
Computing the oks loss between a set of predicted poses and target poses.
The loss is calculated as negative log of oks.
Args:
pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...),
shape (n, K*2).
target (Tensor): Corresponding gt poses, shape (n, K*2).
linear (bool, optional): If True, use linear scale of loss instead of
log scale. Default: False.
eps (float): Eps to avoid log(0).
Returns:
Tensor: Loss tensor.
"""
oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps)
if linear:
loss = 1 - oks
else:
loss = -oks.log()
if weight is not None:
if weight.shape != loss.shape:
if weight.shape[0] == loss.shape[0]:
# For most cases, weight is of shape (num_priors, ),
# which means it does not have the second axis num_class
weight = weight.reshape((-1, 1))
else:
# Sometimes, weight per anchor per class is also needed. e.g.
# in FSAF. But it may be flattened of shape
# (num_priors x num_class, ), while loss is still of shape
# (num_priors, num_class).
assert weight.numel() == loss.numel()
weight = weight.reshape((loss.shape[0], -1))
assert weight.ndim == loss.ndim
loss = loss * weight
# if avg_factor is not specified, just reduce the loss
if avg_factor is None:
if reduction == 'mean':
loss = loss.mean()
elif reduction == 'sum':
loss = loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if reduction == 'mean':
# Avoid causing ZeroDivisionError when avg_factor is 0.0,
# i.e., all labels of an image belong to ignore index.
eps = 1e-10
loss = loss.sum() / (avg_factor + eps)
# if reduction is 'none', then do nothing, otherwise raise an error
elif reduction != 'none':
raise ValueError('avg_factor can not be used with reduction="sum"')
return loss
@register
@serializable
class OKSLoss(nn.Layer):
"""OKSLoss.
Computing the oks loss between a set of predicted poses and target poses.
Args:
linear (bool): If True, use linear scale of loss instead of log scale.
Default: False.
eps (float): Eps to avoid log(0).
reduction (str): Options are "none", "mean" and "sum".
loss_weight (float): Weight of loss.
"""
def __init__(self,
linear=False,
num_keypoints=17,
eps=1e-6,
reduction='mean',
loss_weight=1.0):
super(OKSLoss, self).__init__()
self.linear = linear
self.eps = eps
self.reduction = reduction
self.loss_weight = loss_weight
if num_keypoints == 17:
self.sigmas = np.array([
.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
1.07, .87, .87, .89, .89
], dtype=np.float32) / 10.0
elif num_keypoints == 14:
self.sigmas = np.array([
.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89,
.79, .79
]) / 10.0
else:
raise ValueError(f'Unsupported keypoints number {num_keypoints}')
def forward(self,
pred,
target,
valid,
area,
weight=None,
avg_factor=None,
reduction_override=None,
**kwargs):
"""Forward function.
Args:
pred (Tensor): The prediction.
target (Tensor): The learning target of the prediction.
valid (Tensor): The visible flag of the target pose.
area (Tensor): The area of the target pose.
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): The reduction method used to
override the original reduction method of the loss.
Defaults to None. Options are "none", "mean" and "sum".
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if (weight is not None) and (not paddle.any(weight > 0)) and (
reduction != 'none'):
if pred.dim() == weight.dim() + 1:
weight = weight.unsqueeze(1)
return (pred * weight).sum() # 0
if weight is not None and weight.dim() > 1:
# TODO: remove this in the future
# reduce the weight of shape (n, 4) to (n,) to match the
# iou_loss of shape (n,)
assert weight.shape == pred.shape
weight = weight.mean(-1)
loss = self.loss_weight * oks_loss(
pred,
target,
weight,
valid=valid,
area=area,
linear=self.linear,
sigmas=self.sigmas,
eps=self.eps,
reduction=reduction,
avg_factor=avg_factor,
**kwargs)
return loss
def center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None):
"""Modified focal loss. Exactly the same as CornerNet.
Runs faster and costs a little bit more memory.
Args:
pred (Tensor): The prediction with shape [bs, c, h, w].
gt (Tensor): The learning target of the prediction in gaussian
distribution, with shape [bs, c, h, w].
mask (Tensor): The valid mask. Defaults to None.
"""
if not gt.astype('bool').any():
return pred.sum()*0
pos_inds = gt.equal(1).astype('float32')
if mask is None:
neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32')
else:
neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32')
neg_weights = paddle.pow(1 - gt, 4)
loss = 0
pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds
neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \
neg_inds
num_pos = pos_inds.astype('float32').sum()
pos_loss = pos_loss.sum()
neg_loss = neg_loss.sum()
if num_pos == 0:
loss = loss - neg_loss
else:
loss = loss - (pos_loss + neg_loss) / num_pos
if weight is not None:
if weight.shape != loss.shape:
if weight.shape[0] == loss.shape[0]:
# For most cases, weight is of shape (num_priors, ),
# which means it does not have the second axis num_class
weight = weight.reshape((-1, 1))
else:
# Sometimes, weight per anchor per class is also needed. e.g.
# in FSAF. But it may be flattened of shape
# (num_priors x num_class, ), while loss is still of shape
# (num_priors, num_class).
assert weight.numel() == loss.numel()
weight = weight.reshape((loss.shape[0], -1))
assert weight.ndim == loss.ndim
loss = loss * weight
# if avg_factor is not specified, just reduce the loss
if avg_factor is None:
if reduction == 'mean':
loss = loss.mean()
elif reduction == 'sum':
loss = loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if reduction == 'mean':
# Avoid causing ZeroDivisionError when avg_factor is 0.0,
# i.e., all labels of an image belong to ignore index.
eps = 1e-10
loss = loss.sum() / (avg_factor + eps)
# if reduction is 'none', then do nothing, otherwise raise an error
elif reduction != 'none':
raise ValueError('avg_factor can not be used with reduction="sum"')
return loss
@register
@serializable
class CenterFocalLoss(nn.Layer):
"""CenterFocalLoss is a variant of focal loss.
More details can be found in the `paper
<https://arxiv.org/abs/1808.01244>`_
Args:
reduction (str): Options are "none", "mean" and "sum".
loss_weight (float): Loss weight of current loss.
"""
def __init__(self,
reduction='none',
loss_weight=1.0):
super(CenterFocalLoss, self).__init__()
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred,
target,
weight=None,
mask=None,
avg_factor=None,
reduction_override=None):
"""Forward function.
Args:
pred (Tensor): The prediction.
target (Tensor): The learning target of the prediction in gaussian
distribution.
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
mask (Tensor): The valid mask. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): The reduction method used to
override the original reduction method of the loss.
Defaults to None.
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
loss_reg = self.loss_weight * center_focal_loss(
pred,
target,
weight,
mask=mask,
reduction=reduction,
avg_factor=avg_factor)
return loss_reg
def l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None):
"""L1 loss.
Args:
pred (Tensor): The prediction.
target (Tensor): The learning target of the prediction.
Returns:
Tensor: Calculated loss
"""
if not target.astype('bool').any():
return pred.sum() * 0
assert pred.shape == target.shape
loss = paddle.abs(pred - target)
if weight is not None:
if weight.shape != loss.shape:
if weight.shape[0] == loss.shape[0]:
# For most cases, weight is of shape (num_priors, ),
# which means it does not have the second axis num_class
weight = weight.reshape((-1, 1))
else:
# Sometimes, weight per anchor per class is also needed. e.g.
# in FSAF. But it may be flattened of shape
# (num_priors x num_class, ), while loss is still of shape
# (num_priors, num_class).
assert weight.numel() == loss.numel()
weight = weight.reshape((loss.shape[0], -1))
assert weight.ndim == loss.ndim
loss = loss * weight
# if avg_factor is not specified, just reduce the loss
if avg_factor is None:
if reduction == 'mean':
loss = loss.mean()
elif reduction == 'sum':
loss = loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if reduction == 'mean':
# Avoid causing ZeroDivisionError when avg_factor is 0.0,
# i.e., all labels of an image belong to ignore index.
eps = 1e-10
loss = loss.sum() / (avg_factor + eps)
# if reduction is 'none', then do nothing, otherwise raise an error
elif reduction != 'none':
raise ValueError('avg_factor can not be used with reduction="sum"')
return loss
@register
@serializable
class L1Loss(nn.Layer):
"""L1 loss.
Args:
reduction (str, optional): The method to reduce the loss.
Options are "none", "mean" and "sum".
loss_weight (float, optional): The weight of loss.
"""
def __init__(self, reduction='mean', loss_weight=1.0):
super(L1Loss, self).__init__()
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred,
target,
weight=None,
avg_factor=None,
reduction_override=None):
"""Forward function.
Args:
pred (Tensor): The prediction.
target (Tensor): The learning target of the prediction.
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): The reduction method used to
override the original reduction method of the loss.
Defaults to None.
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
loss_bbox = self.loss_weight * l1_loss(
pred, target, weight, reduction=reduction, avg_factor=avg_factor)
return loss_bbox
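The reduction/avg_factor contract shared by oks_loss, center_focal_loss and l1_loss above can be checked in isolation. A minimal sketch with random inputs (shapes are arbitrary):

import paddle

pred = paddle.rand([8, 4])
target = paddle.rand([8, 4]) + 0.1  # keep target non-zero so the early-exit is skipped
plain_mean = l1_loss(pred, target, reduction='mean')                  # plain mean
by_factor = l1_loss(pred, target, reduction='mean', avg_factor=8.0)   # sum / (8 + eps)
none_kept = l1_loss(pred, target, reduction='none')                   # elementwise, [8, 4]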

View File

@@ -0,0 +1,250 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from itertools import cycle, islice
from collections import abc
import cv2
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.utils.logger import setup_logger
logger = setup_logger('ppdet.engine')
__all__ = ['Pose3DLoss']
@register
@serializable
class Pose3DLoss(nn.Layer):
def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'):
"""
        Pose3DLoss layer
        Args:
            weight_3d (float): weight of the 3d joints loss
            weight_2d (float): weight of the 2d joints loss
            reduction (str): reduction applied by the underlying criterions,
                one of "none", "mean" and "sum"
"""
super(Pose3DLoss, self).__init__()
self.weight_3d = weight_3d
self.weight_2d = weight_2d
self.criterion_2dpose = nn.MSELoss(reduction=reduction)
self.criterion_3dpose = nn.L1Loss(reduction=reduction)
self.criterion_smoothl1 = nn.SmoothL1Loss(
reduction=reduction, delta=1.0)
self.criterion_vertices = nn.L1Loss()
def forward(self, pred3d, pred2d, inputs):
"""
mpjpe: mpjpe loss between 3d joints
        keypoint_2d_loss: 2d joints loss computed by criterion_2dpose
"""
gt_3d_joints = inputs['joints_3d']
gt_2d_joints = inputs['joints_2d']
has_3d_joints = inputs['has_3d_joints']
has_2d_joints = inputs['has_2d_joints']
loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints)
loss = self.weight_3d * loss_3d
epoch = inputs['epoch_id']
if self.weight_2d > 0:
weight = self.weight_2d * pow(0.1, (epoch // 8))
if epoch > 8:
weight = 0
loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d,
gt_2d_joints, has_2d_joints)
loss += weight * loss_2d
return loss
def filter_3d_joints(pred, gt, has_3d_joints):
"""
filter 3d joints
"""
gt = gt[has_3d_joints == 1]
gt = gt[:, :, :3]
pred = pred[has_3d_joints == 1]
gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2
gt = gt - gt_pelvis[:, None, :]
pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2
pred = pred - pred_pelvis[:, None, :]
return pred, gt
def mpjpe(pred, gt, has_3d_joints):
"""
mPJPE loss
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2
).sum(axis=-1)).mean()
return error
def mpjpe_focal(pred, gt, has_3d_joints):
"""
mPJPE loss
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
mse_error = ((pred - gt)**2).sum(axis=-1)
mpjpe_error = paddle.sqrt(mse_error)
mean = mpjpe_error.mean()
std = mpjpe_error.std()
atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std)
mse_error *= atte
return mse_error.mean()
def mpjpe_mse(pred, gt, has_3d_joints, weight=1.):
"""
mPJPE loss
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
error = (((pred - gt)**2).sum(axis=-1)).mean()
return error
def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d):
"""
mPJPE loss of self define criterion
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
error = paddle.sqrt(criterion_pose3d(pred, gt)).mean()
return error
@register
@serializable
def weighted_mpjpe(pred, gt, has_3d_joints):
"""
Weighted_mPJPE
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
weight = paddle.linalg.norm(pred, p=2, axis=-1)
weight = paddle.to_tensor(
[1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1., 1.])
error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean()
return error
@register
@serializable
def normed_mpjpe(pred, gt, has_3d_joints):
"""
Normalized MPJPE (scale only), adapted from:
https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py
"""
    pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
    assert pred.shape == gt.shape
    norm_predicted = paddle.mean(
        paddle.sum(pred**2, axis=-1, keepdim=True), axis=-2, keepdim=True)
    norm_target = paddle.mean(
        paddle.sum(gt * pred, axis=-1, keepdim=True), axis=-2, keepdim=True)
    scale = norm_target / norm_predicted
    # pred/gt are already filtered and pelvis-aligned here, so compute the
    # clipped MPJPE inline instead of re-filtering through mpjpe()
    return paddle.sqrt((paddle.minimum(scale * pred - gt, paddle.to_tensor(1.2))
                        **2).sum(axis=-1)).mean()
@register
@serializable
def mpjpe_np(pred, gt, has_3d_joints):
"""
mPJPE_NP
"""
pred, gt = filter_3d_joints(pred, gt, has_3d_joints)
error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean()
return error
@register
@serializable
def mean_per_vertex_error(pred, gt, has_smpl):
"""
Compute mPVE
"""
pred = pred[has_smpl == 1]
gt = gt[has_smpl == 1]
with paddle.no_grad():
error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean()
return error
@register
@serializable
def keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d,
has_pose_2d):
"""
Compute 2D reprojection loss if 2D keypoint annotations are available.
The confidence (conf) is binary and indicates whether the keypoints exist or not.
"""
conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone()
loss = (conf * criterion_keypoints(
pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean()
return loss
@register
@serializable
def keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d,
has_pose_3d):
"""
Compute 3D keypoint loss if 3D keypoint annotations are available.
"""
conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone()
gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone()
gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1]
conf = conf[has_pose_3d == 1]
pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1]
if len(gt_keypoints_3d) > 0:
gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2
gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :]
pred_pelvis = (
pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2
pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :]
return (conf * criterion_keypoints(pred_keypoints_3d,
gt_keypoints_3d)).mean()
else:
return paddle.to_tensor([1.]).fill_(0.)
@register
@serializable
def vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl):
"""
Compute per-vertex loss if vertex annotations are available.
"""
pred_vertices_with_shape = pred_vertices[has_smpl == 1]
gt_vertices_with_shape = gt_vertices[has_smpl == 1]
if len(gt_vertices_with_shape) > 0:
return criterion_vertices(pred_vertices_with_shape,
gt_vertices_with_shape)
else:
return paddle.to_tensor([1.]).fill_(0.)
@register
@serializable
def rectify_pose(pose):
pose = pose.copy()
R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0]
R_root = cv2.Rodrigues(pose[:3])[0]
new_root = R_root.dot(R_mod)
pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3)
return pose
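A hedged end-to-end sketch of the mpjpe path above: GT joints are [B, K, 4] (x, y, z plus a trailing flag that filter_3d_joints drops), the pelvis is taken from joints 2 and 3, so K must be at least 4. The shapes and values below are illustrative only:

import paddle

pred = paddle.rand([2, 14, 3])
gt = paddle.concat([paddle.rand([2, 14, 3]), paddle.ones([2, 14, 1])], axis=-1)
has_3d = paddle.ones([2], dtype='int64')  # both samples carry 3d annotations
err = mpjpe(pred, gt, has_3d)             # scalar pelvis-aligned error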

View File

@@ -0,0 +1,104 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
__all__ = ['ProbIoULoss']
def gbb_form(boxes):
xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1)
return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1)
def rotated_form(a_, b_, angles):
cos_a = paddle.cos(angles)
sin_a = paddle.sin(angles)
a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2)
b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2)
c = (a_ - b_) * cos_a * sin_a
return a, b, c
def probiou_loss(pred, target, eps=1e-3, mode='l1'):
"""
    pred   -> a matrix [N, 5] (x, y, w, h, angle in radians) containing our predicted boxes; for HBB, angle == 0
    target -> a matrix [N, 5] (x, y, w, h, angle in radians) containing our target boxes; for HBB, angle == 0
    eps    -> threshold to avoid infinite values
    mode   -> 'l1' (values in [0, 1]) or 'l2' (values in [0, inf]), metrics according to the ProbIoU paper
"""
gbboxes1 = gbb_form(pred)
gbboxes2 = gbb_form(target)
    x1, y1, a1_, b1_, c1_ = [gbboxes1[:, i] for i in range(5)]
    x2, y2, a2_, b2_, c2_ = [gbboxes2[:, i] for i in range(5)]
a1, b1, c1 = rotated_form(a1_, b1_, c1_)
a2, b2, c2 = rotated_form(a2_, b2_, c2_)
t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \
0.5 * ((c1+c2)*(x2-x1)*(y1-y2))
t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2)
t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2)
t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps))
B_d = (t1 / t2) + t3
# B_d = t1 + t2 + t3
B_d = paddle.clip(B_d, min=eps, max=100.0)
l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps)
l_i = paddle.pow(l1, 2.0)
l2 = -paddle.log(1.0 - l_i + eps)
if mode == 'l1':
probiou = l1
if mode == 'l2':
probiou = l2
return probiou
@serializable
@register
class ProbIoULoss(object):
""" ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details """
def __init__(self, mode='l1', eps=1e-3):
super(ProbIoULoss, self).__init__()
self.mode = mode
self.eps = eps
def __call__(self, pred_rboxes, assigned_rboxes):
return probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode)
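A minimal usage sketch for ProbIoULoss, assuming rotated boxes encoded as (x, y, w, h, angle in radians); the values are made up:

import paddle

loss_fn = ProbIoULoss(mode='l1', eps=1e-3)
pred_rboxes = paddle.to_tensor([[50., 50., 20., 10., 0.10],
                                [30., 40., 12., 6., 1.20]])
gt_rboxes = paddle.to_tensor([[52., 49., 22., 9., 0.15],
                              [31., 41., 11., 7., 1.10]])
per_box_loss = loss_fn(pred_rboxes, gt_rboxes)  # shape [2], one loss per box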

View File

@@ -0,0 +1,175 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling.losses.iou_loss import GIoULoss
from .sparsercnn_loss import HungarianMatcher
__all__ = ['QueryInstLoss']
@register
class QueryInstLoss(object):
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
focal_loss_alpha=0.25,
focal_loss_gamma=2.0,
class_weight=2.0,
l1_weight=5.0,
giou_weight=2.0,
mask_weight=8.0):
super(QueryInstLoss, self).__init__()
self.num_classes = num_classes
self.focal_loss_alpha = focal_loss_alpha
self.focal_loss_gamma = focal_loss_gamma
self.loss_weights = {
"loss_cls": class_weight,
"loss_bbox": l1_weight,
"loss_giou": giou_weight,
"loss_mask": mask_weight
}
self.giou_loss = GIoULoss(eps=1e-6, reduction='sum')
self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma,
class_weight, l1_weight, giou_weight)
def loss_classes(self, class_logits, targets, indices, avg_factor):
tgt_labels = paddle.full(
class_logits.shape[:2], self.num_classes, dtype='int32')
if sum(len(v['labels']) for v in targets) > 0:
tgt_classes = paddle.concat([
paddle.gather(
tgt['labels'], tgt_idx, axis=0)
for tgt, (_, tgt_idx) in zip(targets, indices)
])
batch_idx, src_idx = self._get_src_permutation_idx(indices)
for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)):
tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i]
tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1)
tgt_labels_onehot = paddle.cast(
tgt_labels == paddle.arange(0, self.num_classes), dtype='float32')
tgt_labels_onehot.stop_gradient = True
src_logits = class_logits.flatten(0, 1)
loss_cls = F.sigmoid_focal_loss(
src_logits,
tgt_labels_onehot,
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction='sum') / avg_factor
losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']}
return losses
def loss_bboxes(self, bbox_pred, targets, indices, avg_factor):
bboxes = paddle.concat([
paddle.gather(
src, src_idx, axis=0)
for src, (src_idx, _) in zip(bbox_pred, indices)
])
tgt_bboxes = paddle.concat([
paddle.gather(
tgt['boxes'], tgt_idx, axis=0)
for tgt, (_, tgt_idx) in zip(targets, indices)
])
tgt_bboxes.stop_gradient = True
im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets])
bboxes_norm = bboxes / im_shapes
tgt_bboxes_norm = tgt_bboxes / im_shapes
loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor
loss_bbox = F.l1_loss(
bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor
losses = {
'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'],
'loss_giou': loss_giou * self.loss_weights['loss_giou']
}
return losses
def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices,
avg_factor):
tgt_segm = [
paddle.gather(
tgt['gt_segm'], tgt_idx, axis=0)
for tgt, (_, tgt_idx) in zip(targets, indices)
]
tgt_masks = []
for i in range(len(indices)):
gt_segm = tgt_segm[i].unsqueeze(1)
if len(gt_segm) == 0:
continue
boxes = pos_bbox_pred[i]
boxes[:, 0::2] = paddle.clip(
boxes[:, 0::2], min=0, max=gt_segm.shape[3])
boxes[:, 1::2] = paddle.clip(
boxes[:, 1::2], min=0, max=gt_segm.shape[2])
boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32')
gt_mask = paddle.vision.ops.roi_align(
gt_segm,
boxes,
boxes_num,
output_size=mask_logits.shape[-2:],
aligned=True)
tgt_masks.append(gt_mask)
tgt_masks = paddle.concat(tgt_masks).squeeze(1)
tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32')
tgt_masks.stop_gradient = True
tgt_labels = paddle.concat([
paddle.gather(
tgt['labels'], tgt_idx, axis=0)
for tgt, (_, tgt_idx) in zip(targets, indices)
])
mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3])
mask_label = paddle.expand_as(mask_label, mask_logits)
mask_label.stop_gradient = True
src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label))
shape = mask_logits.shape
src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]])
src_masks = F.sigmoid(src_masks)
X = src_masks.flatten(1)
Y = tgt_masks.flatten(1)
inter = paddle.sum(X * Y, 1)
union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1)
dice = (2 * inter) / (union + 2e-5)
loss_mask = (1 - dice).sum() / avg_factor
losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']}
return losses
@staticmethod
def _get_src_permutation_idx(indices):
batch_idx = paddle.concat(
[paddle.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = paddle.concat([src for (src, _) in indices])
return batch_idx, src_idx
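The mask term in loss_masks above reduces to a dice loss over flattened masks. A standalone sketch of that arithmetic, with illustrative mask sizes:

import paddle

X = paddle.rand([3, 28 * 28])                            # predicted mask probs
Y = (paddle.rand([3, 28 * 28]) > 0.5).astype('float32')  # binary GT masks
inter = paddle.sum(X * Y, 1)
union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1)
dice = (2 * inter) / (union + 2e-5)
loss_mask = (1 - dice).sum() / 3                         # avg_factor = 3 here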

View File

@@ -0,0 +1,60 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
__all__ = ['SmoothL1Loss']
@register
class SmoothL1Loss(nn.Layer):
"""Smooth L1 Loss.
Args:
        beta (float): controls the smooth region; the loss becomes plain L1 when beta=0.0
loss_weight (float): the final loss will be multiplied by this
"""
def __init__(self,
beta=1.0,
loss_weight=1.0):
super(SmoothL1Loss, self).__init__()
assert beta >= 0
self.beta = beta
self.loss_weight = loss_weight
def forward(self, pred, target, reduction='none'):
"""forward function, based on fvcore.
Args:
pred (Tensor): prediction tensor
target (Tensor): target tensor, pred.shape must be the same as target.shape
reduction (str): the way to reduce loss, one of (none, sum, mean)
"""
assert reduction in ('none', 'sum', 'mean')
target = target.detach()
if self.beta < 1e-5:
loss = paddle.abs(pred - target)
else:
n = paddle.abs(pred - target)
cond = n < self.beta
loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta)
if reduction == 'mean':
loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()
elif reduction == 'sum':
loss = loss.sum()
return loss * self.loss_weight
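A quick check of the beta threshold: the loss is quadratic for |pred - target| < beta and linear beyond it. The expected values below are hand-computed for beta = 1:

import paddle

loss_fn = SmoothL1Loss(beta=1.0, loss_weight=1.0)
pred = paddle.to_tensor([0.2, 3.0])
target = paddle.zeros([2])
print(loss_fn(pred, target))  # ~[0.02, 2.5]: 0.5 * 0.2**2 / 1 and 3.0 - 0.5 * 1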

View File

@@ -0,0 +1,101 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
__all__ = ['SOLOv2Loss']
@register
@serializable
class SOLOv2Loss(object):
"""
SOLOv2Loss
Args:
ins_loss_weight (float): Weight of instance loss.
focal_loss_gamma (float): Gamma parameter for focal loss.
focal_loss_alpha (float): Alpha parameter for focal loss.
"""
def __init__(self,
ins_loss_weight=3.0,
focal_loss_gamma=2.0,
focal_loss_alpha=0.25):
self.ins_loss_weight = ins_loss_weight
self.focal_loss_gamma = focal_loss_gamma
self.focal_loss_alpha = focal_loss_alpha
def _dice_loss(self, input, target):
input = paddle.reshape(input, shape=(paddle.shape(input)[0], -1))
target = paddle.reshape(target, shape=(paddle.shape(target)[0], -1))
a = paddle.sum(input * target, axis=1)
b = paddle.sum(input * input, axis=1) + 0.001
c = paddle.sum(target * target, axis=1) + 0.001
d = (2 * a) / (b + c)
return 1 - d
def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels,
num_ins):
"""
Get loss of network of SOLOv2.
Args:
ins_pred_list (list): Variable list of instance branch output.
            ins_label_list (list): List of instance labels per batch.
            cate_preds (list): Concatenated Variable list of category branch output.
            cate_labels (list): Concatenated list of category labels per batch.
num_ins (int): Number of positive samples in a mini-batch.
Returns:
loss_ins (Variable): The instance loss Variable of SOLOv2 network.
loss_cate (Variable): The category loss Variable of SOLOv2 network.
"""
        # 1. Use dice_loss to calculate instance loss
loss_ins = []
total_weights = paddle.zeros(shape=[1], dtype='float32')
for input, target in zip(ins_pred_list, ins_label_list):
if input is None:
continue
target = paddle.cast(target, 'float32')
target = paddle.reshape(
target,
shape=[-1, paddle.shape(input)[-2], paddle.shape(input)[-1]])
weights = paddle.cast(
paddle.sum(target, axis=[1, 2]) > 0, 'float32')
input = F.sigmoid(input)
dice_out = paddle.multiply(self._dice_loss(input, target), weights)
total_weights += paddle.sum(weights)
loss_ins.append(dice_out)
loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights
loss_ins = loss_ins * self.ins_loss_weight
        # 2. Use sigmoid_focal_loss to calculate category loss
# expand onehot labels
num_classes = cate_preds.shape[-1]
cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1)
cate_labels_bin = cate_labels_bin[:, 1:]
loss_cate = F.sigmoid_focal_loss(
cate_preds,
label=cate_labels_bin,
normalizer=num_ins + 1.,
gamma=self.focal_loss_gamma,
alpha=self.focal_loss_alpha)
return loss_ins, loss_cate
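The category branch treats label 0 as background: labels are one-hot encoded over num_classes + 1 bins and the first column is dropped, so background rows become all-zero targets. A small sketch of that shift (num_classes = 80 assumed for illustration):

import paddle
import paddle.nn.functional as F

cate_labels = paddle.to_tensor([0, 3, 1], dtype='int64')  # 0 means background
onehot = F.one_hot(cate_labels, num_classes=81)[:, 1:]    # shape [3, 80]
# row 0 is all zeros (pure negative); rows 1 and 2 mark foreground
# classes 2 and 0 respectively (labels shifted down by one)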

View File

@@ -0,0 +1,430 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py
The copyright of PeizeSun/SparseR-CNN is as follows:
MIT License [see LICENSE for details]
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from scipy.optimize import linear_sum_assignment
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.metric import accuracy
from ppdet.core.workspace import register
from ppdet.modeling.losses.iou_loss import GIoULoss
__all__ = ["SparseRCNNLoss"]
@register
class SparseRCNNLoss(nn.Layer):
""" This class computes the loss for SparseRCNN.
The process happens in two steps:
1) we compute hungarian assignment between ground truth boxes and the outputs of the model
2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
"""
__shared__ = ['num_classes']
def __init__(self,
losses,
focal_loss_alpha,
focal_loss_gamma,
num_classes=80,
class_weight=2.,
l1_weight=5.,
giou_weight=2.):
""" Create the criterion.
Parameters:
num_classes: number of object categories, omitting the special no-object category
weight_dict: dict containing as key the names of the losses and as values their relative weight.
losses: list of all the losses to be applied. See get_loss for list of available losses.
matcher: module able to compute a matching between targets and proposals
"""
super().__init__()
self.num_classes = num_classes
weight_dict = {
"loss_ce": class_weight,
"loss_bbox": l1_weight,
"loss_giou": giou_weight
}
self.weight_dict = weight_dict
self.losses = losses
self.giou_loss = GIoULoss(reduction="sum")
self.focal_loss_alpha = focal_loss_alpha
self.focal_loss_gamma = focal_loss_gamma
self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma,
class_weight, l1_weight, giou_weight)
def loss_labels(self, outputs, targets, indices, num_boxes, log=True):
"""Classification loss (NLL)
targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes]
"""
assert 'pred_logits' in outputs
src_logits = outputs['pred_logits']
idx = self._get_src_permutation_idx(indices)
target_classes_o = paddle.concat([
paddle.gather(
t["labels"], J, axis=0) for t, (_, J) in zip(targets, indices)
])
target_classes = paddle.full(
src_logits.shape[:2], self.num_classes, dtype="int32")
for i, ind in enumerate(zip(idx[0], idx[1])):
target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i]
target_classes.stop_gradient = True
src_logits = src_logits.flatten(start_axis=0, stop_axis=1)
# prepare one_hot target.
target_classes = target_classes.flatten(start_axis=0, stop_axis=1)
class_ids = paddle.arange(0, self.num_classes)
labels = (target_classes.unsqueeze(-1) == class_ids).astype("float32")
labels.stop_gradient = True
# comp focal loss.
class_loss = sigmoid_focal_loss(
src_logits,
labels,
alpha=self.focal_loss_alpha,
gamma=self.focal_loss_gamma,
reduction="sum", ) / num_boxes
losses = {'loss_ce': class_loss}
if log:
label_acc = target_classes_o.unsqueeze(-1)
src_idx = [src for (src, _) in indices]
pred_list = []
for i in range(outputs["pred_logits"].shape[0]):
pred_list.append(
paddle.gather(
outputs["pred_logits"][i], src_idx[i], axis=0))
pred = F.sigmoid(paddle.concat(pred_list, axis=0))
acc = accuracy(pred, label_acc.astype("int64"))
losses["acc"] = acc
return losses
def loss_boxes(self, outputs, targets, indices, num_boxes):
"""Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss
targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]
The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size.
"""
assert 'pred_boxes' in outputs # [batch_size, num_proposals, 4]
src_idx = [src for (src, _) in indices]
src_boxes_list = []
for i in range(outputs["pred_boxes"].shape[0]):
src_boxes_list.append(
paddle.gather(
outputs["pred_boxes"][i], src_idx[i], axis=0))
src_boxes = paddle.concat(src_boxes_list, axis=0)
target_boxes = paddle.concat(
[
paddle.gather(
t['boxes'], I, axis=0)
for t, (_, I) in zip(targets, indices)
],
axis=0)
target_boxes.stop_gradient = True
losses = {}
losses['loss_giou'] = self.giou_loss(src_boxes,
target_boxes) / num_boxes
image_size = paddle.concat([v["img_whwh_tgt"] for v in targets])
src_boxes_ = src_boxes / image_size
target_boxes_ = target_boxes / image_size
loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum')
losses['loss_bbox'] = loss_bbox / num_boxes
return losses
def _get_src_permutation_idx(self, indices):
# permute predictions following indices
batch_idx = paddle.concat(
[paddle.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = paddle.concat([src for (src, _) in indices])
return batch_idx, src_idx
def _get_tgt_permutation_idx(self, indices):
# permute targets following indices
batch_idx = paddle.concat(
[paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
tgt_idx = paddle.concat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs):
loss_map = {
'labels': self.loss_labels,
'boxes': self.loss_boxes,
}
assert loss in loss_map, f'do you really want to compute {loss} loss?'
return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs)
def forward(self, outputs, targets):
""" This performs the loss computation.
Parameters:
outputs: dict of tensors, see the output specification of the model for the format
targets: list of dicts, such that len(targets) == batch_size.
The expected keys in each dict depends on the losses applied, see each loss' doc
"""
outputs_without_aux = {
k: v
for k, v in outputs.items() if k != 'aux_outputs'
}
# Retrieve the matching between the outputs of the last layer and the targets
indices = self.matcher(outputs_without_aux, targets)
# Compute the average number of target boxes across all nodes, for normalization purposes
num_boxes = sum(len(t["labels"]) for t in targets)
num_boxes = paddle.to_tensor(
[num_boxes],
dtype="float32",
place=next(iter(outputs.values())).place)
# Compute all the requested losses
losses = {}
for loss in self.losses:
losses.update(
self.get_loss(loss, outputs, targets, indices, num_boxes))
# In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
if 'aux_outputs' in outputs:
for i, aux_outputs in enumerate(outputs['aux_outputs']):
indices = self.matcher(aux_outputs, targets)
for loss in self.losses:
kwargs = {}
if loss == 'labels':
# Logging is enabled only for the last layer
kwargs = {'log': False}
l_dict = self.get_loss(loss, aux_outputs, targets, indices,
num_boxes, **kwargs)
w_dict = {}
for k in l_dict.keys():
if k in self.weight_dict:
w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[
k]
else:
w_dict[k + f'_{i}'] = l_dict[k]
losses.update(w_dict)
return losses
class HungarianMatcher(nn.Layer):
"""This class computes an assignment between the targets and the predictions of the network
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
while the others are un-matched (and thus treated as non-objects).
"""
def __init__(self,
focal_loss_alpha,
focal_loss_gamma,
cost_class: float=1,
cost_bbox: float=1,
cost_giou: float=1):
"""Creates the matcher
Params:
cost_class: This is the relative weight of the classification error in the matching cost
cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost
cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost
"""
super().__init__()
self.cost_class = cost_class
self.cost_bbox = cost_bbox
self.cost_giou = cost_giou
self.focal_loss_alpha = focal_loss_alpha
self.focal_loss_gamma = focal_loss_gamma
        assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs can't be 0"
@paddle.no_grad()
def forward(self, outputs, targets):
""" Performs the matching
Args:
outputs: This is a dict that contains at least these entries:
"pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
"pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates
eg. outputs = {"pred_logits": pred_logits, "pred_boxes": pred_boxes}
targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
"labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
objects in the target) containing the class labels
"boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates
eg. targets = [{"labels":labels, "boxes": boxes}, ...,{"labels":labels, "boxes": boxes}]
Returns:
A list of size batch_size, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds:
len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
bs, num_queries = outputs["pred_logits"].shape[:2]
if sum(len(v["labels"]) for v in targets) == 0:
return [(paddle.to_tensor(
[], dtype=paddle.int64), paddle.to_tensor(
[], dtype=paddle.int64)) for _ in range(bs)]
# We flatten to compute the cost matrices in a batch
out_prob = F.sigmoid(outputs["pred_logits"].flatten(
start_axis=0, stop_axis=1))
out_bbox = outputs["pred_boxes"].flatten(start_axis=0, stop_axis=1)
# Also concat the target labels and boxes
tgt_ids = paddle.concat([v["labels"] for v in targets])
assert (tgt_ids > -1).all()
tgt_bbox = paddle.concat([v["boxes"] for v in targets])
        # Compute the classification cost. Contrary to the loss, we don't use the NLL,
        # but approximate it with 1 - proba[target class].
        # The 1 is a constant that doesn't change the matching, so it can be omitted.
alpha = self.focal_loss_alpha
gamma = self.focal_loss_gamma
neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(
1 - out_prob + 1e-8).log())
pos_cost_class = alpha * ((1 - out_prob)
**gamma) * (-(out_prob + 1e-8).log())
cost_class = paddle.gather(
pos_cost_class, tgt_ids, axis=1) - paddle.gather(
neg_cost_class, tgt_ids, axis=1)
# Compute the L1 cost between boxes
image_size_out = paddle.concat(
[v["img_whwh"].unsqueeze(0) for v in targets])
image_size_out = image_size_out.unsqueeze(1).tile(
[1, num_queries, 1]).flatten(
start_axis=0, stop_axis=1)
image_size_tgt = paddle.concat([v["img_whwh_tgt"] for v in targets])
out_bbox_ = out_bbox / image_size_out
tgt_bbox_ = tgt_bbox / image_size_tgt
cost_bbox = F.l1_loss(
out_bbox_.unsqueeze(-2), tgt_bbox_,
reduction='none').sum(-1) # [batch_size * num_queries, num_tgts]
        # Compute the GIoU cost between boxes
cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox)
# Final cost matrix
C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou
C = C.reshape([bs, num_queries, -1])
sizes = [len(v["boxes"]) for v in targets]
indices = [
linear_sum_assignment(c[i].numpy())
for i, c in enumerate(C.split(sizes, -1))
]
return [(paddle.to_tensor(
i, dtype="int32"), paddle.to_tensor(
j, dtype="int32")) for i, j in indices]
def box_area(boxes):
assert (boxes[:, 2:] >= boxes[:, :2]).all()
wh = boxes[:, 2:] - boxes[:, :2]
return wh[:, 0] * wh[:, 1]
def boxes_iou(boxes1, boxes2):
'''
Compute iou
Args:
boxes1 (paddle.tensor) shape (N, 4)
boxes2 (paddle.tensor) shape (M, 4)
Return:
(paddle.tensor) shape (N, M)
'''
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2])
rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:])
wh = (rb - lt).astype("float32").clip(min=1e-9)
inter = wh[:, :, 0] * wh[:, :, 1]
union = area1.unsqueeze(-1) + area2 - inter + 1e-9
iou = inter / union
return iou, union
def get_bboxes_giou(boxes1, boxes2, eps=1e-9):
"""calculate the ious of boxes1 and boxes2
Args:
boxes1 (Tensor): shape [N, 4]
boxes2 (Tensor): shape [M, 4]
eps (float): epsilon to avoid divide by zero
Return:
ious (Tensor): ious of boxes1 and boxes2, with the shape [N, M]
"""
assert (boxes1[:, 2:] >= boxes1[:, :2]).all()
assert (boxes2[:, 2:] >= boxes2[:, :2]).all()
iou, union = boxes_iou(boxes1, boxes2)
lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2])
rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:])
wh = (rb - lt).astype("float32").clip(min=eps)
enclose_area = wh[:, :, 0] * wh[:, :, 1]
giou = iou - (enclose_area - union) / enclose_area
return giou
def sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction="sum"):
    assert reduction in ["sum", "mean"], f'unsupported reduction: {reduction}'
p = F.sigmoid(inputs)
ce_loss = F.binary_cross_entropy_with_logits(
inputs, targets, reduction="none")
p_t = p * targets + (1 - p) * (1 - targets)
loss = ce_loss * ((1 - p_t)**gamma)
if alpha >= 0:
alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
loss = alpha_t * loss
if reduction == "mean":
loss = loss.mean()
elif reduction == "sum":
loss = loss.sum()
return loss
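# A minimal smoke test for the focal loss above (illustrative only: the
# logits and one-hot targets are made up, and the block assumes a working
# paddle install).
if __name__ == '__main__':
    logits = paddle.to_tensor([[2.0, -1.0, 0.5], [-0.5, 1.5, -2.0]])
    onehot = paddle.to_tensor([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]])
    print(sigmoid_focal_loss(logits, onehot, alpha=0.25, gamma=2.0))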

View File

@@ -0,0 +1,168 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import iou_similarity, bbox2delta
__all__ = ['SSDLoss']
@register
class SSDLoss(nn.Layer):
"""
SSDLoss
Args:
        overlap_threshold (float32, optional): IoU threshold separating
            positive from negative matches between prior boxes and ground
            truth, 0.5 by default.
        neg_pos_ratio (float): Ratio of negative to positive samples kept
            by hard negative mining.
loc_loss_weight (float): The weight of loc_loss.
conf_loss_weight (float): The weight of conf_loss.
prior_box_var (list): Variances corresponding to prior box coord, [0.1,
0.1, 0.2, 0.2] by default.
"""
def __init__(self,
overlap_threshold=0.5,
neg_pos_ratio=3.0,
loc_loss_weight=1.0,
conf_loss_weight=1.0,
prior_box_var=[0.1, 0.1, 0.2, 0.2]):
super(SSDLoss, self).__init__()
self.overlap_threshold = overlap_threshold
self.neg_pos_ratio = neg_pos_ratio
self.loc_loss_weight = loc_loss_weight
self.conf_loss_weight = conf_loss_weight
self.prior_box_var = [1. / a for a in prior_box_var]
def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes,
bg_index):
"""
Args:
gt_bbox (Tensor): [B, N, 4]
gt_label (Tensor): [B, N, 1]
prior_boxes (Tensor): [A, 4]
bg_index (int): Background class index
"""
batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0]
ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape(
(batch_size, -1, num_priors))
# For each prior box, get the max IoU of all GTs.
prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1)
# For each GT, get the max IoU of all prior boxes.
gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2)
# Gather target bbox and label according to 'prior_argmax_iou' index.
batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1)
prior_argmax_iou = paddle.stack(
[batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1)
targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou)
targets_label = paddle.gather_nd(gt_label, prior_argmax_iou)
# Assign negative
bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index,
'int64')
targets_label = paddle.where(
prior_max_iou.unsqueeze(-1) < self.overlap_threshold,
bg_index_tensor, targets_label)
# Ensure each GT can match the max IoU prior box.
batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten()
targets_bbox = paddle.scatter(
targets_bbox.reshape([-1, 4]), batch_ind,
gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4])
targets_label = paddle.scatter(
targets_label.reshape([-1, 1]), batch_ind,
gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1])
targets_label[:, :1] = bg_index
# Encode box
prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1])
targets_bbox = bbox2delta(
prior_boxes.reshape([-1, 4]),
targets_bbox.reshape([-1, 4]), self.prior_box_var)
targets_bbox = targets_bbox.reshape([batch_size, -1, 4])
return targets_bbox, targets_label
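    # Worked toy example (illustrative): with overlap_threshold = 0.5, one
    # image, and an IoU matrix between 2 GTs and 3 priors of
    #     prior0  prior1  prior2
    # gt0 [0.10,  0.60,   0.20]
    # gt1 [0.40,  0.30,   0.45]
    # the threshold step labels prior1 with gt0 and priors 0/2 as background
    # (their max IoU is below 0.5); the scatter step then force-assigns
    # prior2 to gt1 (its best prior), so every GT is matched at least once.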
def _mine_hard_example(self,
conf_loss,
targets_label,
bg_index,
mine_neg_ratio=0.01):
pos = (targets_label != bg_index).astype(conf_loss.dtype)
num_pos = pos.sum(axis=1, keepdim=True)
neg = (targets_label == bg_index).astype(conf_loss.dtype)
conf_loss = conf_loss.detach() * neg
loss_idx = conf_loss.argsort(axis=1, descending=True)
idx_rank = loss_idx.argsort(axis=1)
num_negs = []
for i in range(conf_loss.shape[0]):
cur_num_pos = num_pos[i]
num_neg = paddle.clip(
cur_num_pos * self.neg_pos_ratio, max=pos.shape[1])
num_neg = num_neg if num_neg > 0 else paddle.to_tensor(
[pos.shape[1] * mine_neg_ratio])
num_negs.append(num_neg)
num_negs = paddle.stack(num_negs).expand_as(idx_rank)
neg_mask = (idx_rank < num_negs).astype(conf_loss.dtype)
return (neg_mask + pos).astype('bool')
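    # The argsort-of-argsort trick above turns losses into per-box ranks.
    # Toy example (illustrative): conf_loss = [0.9, 0.1, 0.5] gives
    # loss_idx = [0, 2, 1] (indices sorted by descending loss) and
    # idx_rank = [0, 2, 1] (rank of each box: 0.9 -> 0, 0.1 -> 2, 0.5 -> 1);
    # keeping idx_rank < num_neg therefore selects exactly the num_neg boxes
    # with the highest confidence loss as hard negatives.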
def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes):
boxes = paddle.concat(boxes, axis=1)
scores = paddle.concat(scores, axis=1)
gt_label = gt_label.unsqueeze(-1).astype('int64')
prior_boxes = paddle.concat(prior_boxes, axis=0)
bg_index = scores.shape[-1] - 1
# Match bbox and get targets.
targets_bbox, targets_label = \
self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index)
targets_bbox.stop_gradient = True
targets_label.stop_gradient = True
# Compute regression loss.
# Select positive samples.
bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4])
if bbox_mask.astype(boxes.dtype).sum() > 0:
location = paddle.masked_select(boxes, bbox_mask)
targets_bbox = paddle.masked_select(targets_bbox, bbox_mask)
loc_loss = F.smooth_l1_loss(location, targets_bbox, reduction='sum')
loc_loss = loc_loss * self.loc_loss_weight
else:
loc_loss = paddle.zeros([1])
# Compute confidence loss.
conf_loss = F.cross_entropy(scores, targets_label, reduction="none")
# Mining hard examples.
label_mask = self._mine_hard_example(
conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index)
conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1))
conf_loss = conf_loss.sum() * self.conf_loss_weight
# Compute overall weighted loss.
normalizer = (targets_label != bg_index).astype('float32').sum().clip(
min=1)
loss = (conf_loss + loc_loss) / normalizer
return loss
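# Recap of the forward pass above (illustrative notation): with N_pos matched
# priors,
#   loss = (conf_loss_weight * conf_sum + loc_loss_weight * loc_sum)
#          / max(N_pos, 1)
# where loc_sum is a smooth-L1 over the encoded deltas of positive priors
# only, and conf_sum is the cross entropy summed over positives plus the hard
# negatives picked by _mine_hard_example.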

View File

@@ -0,0 +1,83 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import random
from ppdet.core.workspace import register
__all__ = ['SupContrast']
@register
class SupContrast(nn.Layer):
    __shared__ = ['num_classes']
def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75):
super(SupContrast, self).__init__()
self.num_classes = num_classes
self.temperature = temperature
self.sample_num = sample_num
self.thresh = thresh
def forward(self, features, labels, scores):
assert features.shape[0] == labels.shape[0] == scores.shape[0]
positive_mask = (labels < self.num_classes)
positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \
scores[positive_mask]
negative_mask = (labels == self.num_classes)
negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \
scores[negative_mask]
        N = negative_features.shape[0]
        S = self.sample_num - positive_mask.sum()
        # sample S negatives without replacement (assumes N >= S)
        index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32')
negative_features = paddle.index_select(x=negative_features, index=index, axis=0)
negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0)
negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0)
features = paddle.concat([positive_features, negative_features], 0)
labels = paddle.concat([positive_labels, negative_labels], 0)
scores = paddle.concat([positive_scores, negative_scores], 0)
if len(labels.shape) == 1:
labels = labels.reshape([-1, 1])
        # cast the boolean mask so it can enter the arithmetic below
        label_mask = paddle.equal(labels, labels.T).astype('float32').detach()
similarity = (paddle.matmul(features, features.T) / self.temperature)
sim_row_max = paddle.max(similarity, axis=1, keepdim=True)
similarity = similarity - sim_row_max
logits_mask = paddle.ones_like(similarity).detach()
logits_mask.fill_diagonal_(0)
exp_sim = paddle.exp(similarity) * logits_mask
log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True))
per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1)
keep = scores > self.thresh
per_label_log_prob = per_label_log_prob[keep]
loss = -per_label_log_prob
return loss.mean()
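# Up to the diagonal term kept in label_mask, the vectorized block above
# computes the supervised contrastive loss of Khosla et al. (2020): for each
# retained sample i with positive set P(i) (other samples sharing its label)
# and temperature T,
#   L_i = -(1 / |P(i)|) * sum_{p in P(i)} log( exp(z_i . z_p / T)
#                                             / sum_{a != i} exp(z_i . z_a / T) )
# Rows are max-shifted before exponentiation for numerical stability, and
# only samples whose score exceeds `thresh` contribute to the final mean.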

View File

@@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling import ops
__all__ = ['VarifocalLoss']
def varifocal_loss(pred,
target,
alpha=0.75,
gamma=2.0,
iou_weighted=True,
use_sigmoid=True):
"""`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
Args:
pred (Tensor): The prediction with shape (N, C), C is the
number of classes
target (Tensor): The learning target of the iou-aware
classification score with shape (N, C), C is the number of classes.
alpha (float, optional): A balance factor for the negative part of
Varifocal Loss, which is different from the alpha of Focal Loss.
Defaults to 0.75.
gamma (float, optional): The gamma for calculating the modulating
factor. Defaults to 2.0.
iou_weighted (bool, optional): Whether to weight the loss of the
            positive example with the iou target. Defaults to True.
        use_sigmoid (bool, optional): Whether to apply sigmoid to `pred`
            before computing the loss. Defaults to True.
    """
# pred and target should be of the same size
assert pred.shape == target.shape
if use_sigmoid:
pred_new = F.sigmoid(pred)
else:
pred_new = pred
target = target.cast(pred.dtype)
if iou_weighted:
focal_weight = target * (target > 0.0).cast('float32') + \
alpha * (pred_new - target).abs().pow(gamma) * \
(target <= 0.0).cast('float32')
else:
focal_weight = (target > 0.0).cast('float32') + \
alpha * (pred_new - target).abs().pow(gamma) * \
(target <= 0.0).cast('float32')
if use_sigmoid:
loss = F.binary_cross_entropy_with_logits(
pred, target, reduction='none') * focal_weight
else:
loss = F.binary_cross_entropy(
pred, target, reduction='none') * focal_weight
loss = loss.sum(axis=1)
return loss
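# Weighting recap (illustrative numbers): with alpha = 0.75 and gamma = 2.0,
# a positive location with IoU target q = 0.9 is weighted by q itself (when
# iou_weighted), while a negative location (q = 0) with predicted score
# p = 0.3 gets weight 0.75 * |0.3 - 0|^2 = 0.0675 -- easy negatives
# contribute little, and positives are scaled by their localization quality.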
@register
@serializable
class VarifocalLoss(nn.Layer):
def __init__(self,
use_sigmoid=True,
alpha=0.75,
gamma=2.0,
iou_weighted=True,
reduction='mean',
loss_weight=1.0):
"""`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
Args:
use_sigmoid (bool, optional): Whether the prediction is
used for sigmoid or softmax. Defaults to True.
alpha (float, optional): A balance factor for the negative part of
Varifocal Loss, which is different from the alpha of Focal
Loss. Defaults to 0.75.
gamma (float, optional): The gamma for calculating the modulating
factor. Defaults to 2.0.
iou_weighted (bool, optional): Whether to weight the loss of the
positive examples with the iou target. Defaults to True.
reduction (str, optional): The method used to reduce the loss into
a scalar. Defaults to 'mean'. Options are "none", "mean" and
"sum".
loss_weight (float, optional): Weight of loss. Defaults to 1.0.
"""
super(VarifocalLoss, self).__init__()
assert alpha >= 0.0
self.use_sigmoid = use_sigmoid
self.alpha = alpha
self.gamma = gamma
self.iou_weighted = iou_weighted
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self, pred, target, weight=None, avg_factor=None):
"""Forward function.
Args:
pred (Tensor): The prediction.
target (Tensor): The learning target of the prediction.
weight (Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
Returns:
Tensor: The calculated loss
"""
loss = self.loss_weight * varifocal_loss(
pred,
target,
alpha=self.alpha,
gamma=self.gamma,
iou_weighted=self.iou_weighted,
use_sigmoid=self.use_sigmoid)
if weight is not None:
loss = loss * weight
if avg_factor is None:
if self.reduction == 'none':
return loss
elif self.reduction == 'mean':
return loss.mean()
elif self.reduction == 'sum':
return loss.sum()
else:
# if reduction is mean, then average the loss by avg_factor
if self.reduction == 'mean':
loss = loss.sum() / avg_factor
# if reduction is 'none', then do nothing, otherwise raise an error
elif self.reduction != 'none':
raise ValueError(
'avg_factor can not be used with reduction="sum"')
return loss
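# A minimal smoke test of the functional form above (illustrative only: the
# logits and IoU-aware targets are made up, and the block assumes a working
# paddle install).
if __name__ == '__main__':
    pred = paddle.to_tensor([[2.0, -1.0], [-0.5, 1.0]])  # logits, N=2, C=2
    target = paddle.to_tensor([[0.8, 0.0], [0.0, 0.6]])  # IoU-aware targets
    print(varifocal_loss(pred, target, alpha=0.75, gamma=2.0))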

View File

@@ -0,0 +1,207 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity
__all__ = ['YOLOv3Loss']
def bbox_transform(pbox, anchor, downsample):
pbox = decode_yolo(pbox, anchor, downsample)
pbox = xywh2xyxy(pbox)
return pbox
@register
class YOLOv3Loss(nn.Layer):
__inject__ = ['iou_loss', 'iou_aware_loss']
__shared__ = ['num_classes']
def __init__(self,
num_classes=80,
ignore_thresh=0.7,
label_smooth=False,
downsample=[32, 16, 8],
scale_x_y=1.,
iou_loss=None,
iou_aware_loss=None):
"""
YOLOv3Loss layer
Args:
            num_classes (int): number of foreground classes
ignore_thresh (float): threshold to ignore confidence loss
label_smooth (bool): whether to use label smoothing
downsample (list): downsample ratio for each detection block
scale_x_y (float): scale_x_y factor
iou_loss (object): IoULoss instance
iou_aware_loss (object): IouAwareLoss instance
"""
super(YOLOv3Loss, self).__init__()
self.num_classes = num_classes
self.ignore_thresh = ignore_thresh
self.label_smooth = label_smooth
self.downsample = downsample
self.scale_x_y = scale_x_y
self.iou_loss = iou_loss
self.iou_aware_loss = iou_aware_loss
self.distill_pairs = []
def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample):
# pbox
pbox = decode_yolo(pbox, anchor, downsample)
pbox = xywh2xyxy(pbox)
pbox = paddle.concat(pbox, axis=-1)
b = pbox.shape[0]
pbox = pbox.reshape((b, -1, 4))
        # gbox: convert (cx, cy, w, h) to corner form (x1, y1, x2, y2);
        # note that 'gwh' actually holds the bottom-right corner here
        gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5
        gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5
        gbox = paddle.concat([gxy, gwh], axis=-1)
iou = batch_iou_similarity(pbox, gbox)
iou.stop_gradient = True
iou_max = iou.max(2) # [N, M1]
iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype)
iou_mask.stop_gradient = True
pobj = pobj.reshape((b, -1))
tobj = tobj.reshape((b, -1))
obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype)
obj_mask.stop_gradient = True
loss_obj = F.binary_cross_entropy_with_logits(
pobj, obj_mask, reduction='none')
loss_obj_pos = (loss_obj * tobj)
loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask)
return loss_obj_pos + loss_obj_neg
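    # Ignore-region recap (illustrative): a predicted box whose best IoU with
    # any GT exceeds ignore_thresh (0.7 by default) receives neither a
    # positive nor a negative objectness gradient; it is likely a decent
    # detection that simply was not the assigned anchor, so penalizing it as
    # background would be harmful.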
def cls_loss(self, pcls, tcls):
if self.label_smooth:
delta = min(1. / self.num_classes, 1. / 40)
pos, neg = 1 - delta, delta
# 1 for positive, 0 for negative
tcls = pos * paddle.cast(
tcls > 0., dtype=tcls.dtype) + neg * paddle.cast(
tcls <= 0., dtype=tcls.dtype)
loss_cls = F.binary_cross_entropy_with_logits(
pcls, tcls, reduction='none')
return loss_cls
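    # Label-smoothing arithmetic (illustrative): with num_classes = 80,
    # delta = min(1/80, 1/40) = 0.0125, so positive targets become
    # 1 - 0.0125 = 0.9875 and negative targets 0.0125 instead of hard 1/0.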
def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1.,
eps=1e-10):
na = len(anchor)
b, c, h, w = p.shape
if self.iou_aware_loss:
ioup, p = p[:, 0:na, :, :], p[:, na:, :, :]
ioup = ioup.unsqueeze(-1)
p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2))
x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2]
w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4]
obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:]
self.distill_pairs.append([x, y, w, h, obj, pcls])
t = t.transpose((0, 1, 3, 4, 2))
tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2]
tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4]
tscale = t[:, :, :, :, 4:5]
tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:]
tscale_obj = tscale * tobj
loss = dict()
x = scale * F.sigmoid(x) - 0.5 * (scale - 1.)
y = scale * F.sigmoid(y) - 0.5 * (scale - 1.)
if abs(scale - 1.) < eps:
loss_x = F.binary_cross_entropy(x, tx, reduction='none')
loss_y = F.binary_cross_entropy(y, ty, reduction='none')
loss_xy = tscale_obj * (loss_x + loss_y)
else:
loss_x = paddle.abs(x - tx)
loss_y = paddle.abs(y - ty)
loss_xy = tscale_obj * (loss_x + loss_y)
loss_xy = loss_xy.sum([1, 2, 3, 4]).mean()
loss_w = paddle.abs(w - tw)
loss_h = paddle.abs(h - th)
loss_wh = tscale_obj * (loss_w + loss_h)
loss_wh = loss_wh.sum([1, 2, 3, 4]).mean()
loss['loss_xy'] = loss_xy
loss['loss_wh'] = loss_wh
if self.iou_loss is not None:
            # warning: do not modify x, y, w, h in place
box, tbox = [x, y, w, h], [tx, ty, tw, th]
pbox = bbox_transform(box, anchor, downsample)
gbox = bbox_transform(tbox, anchor, downsample)
loss_iou = self.iou_loss(pbox, gbox)
loss_iou = loss_iou * tscale_obj
loss_iou = loss_iou.sum([1, 2, 3, 4]).mean()
loss['loss_iou'] = loss_iou
if self.iou_aware_loss is not None:
box, tbox = [x, y, w, h], [tx, ty, tw, th]
pbox = bbox_transform(box, anchor, downsample)
gbox = bbox_transform(tbox, anchor, downsample)
loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox)
loss_iou_aware = loss_iou_aware * tobj
loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean()
loss['loss_iou_aware'] = loss_iou_aware
box = [x, y, w, h]
loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample)
loss_obj = loss_obj.sum(-1).mean()
loss['loss_obj'] = loss_obj
loss_cls = self.cls_loss(pcls, tcls) * tobj
loss_cls = loss_cls.sum([1, 2, 3, 4]).mean()
loss['loss_cls'] = loss_cls
return loss
def forward(self, inputs, targets, anchors):
        num_levels = len(inputs)
        gt_targets = [targets['target{}'.format(i)] for i in range(num_levels)]
gt_box = targets['gt_bbox']
yolo_losses = dict()
self.distill_pairs.clear()
for x, t, anchor, downsample in zip(inputs, gt_targets, anchors,
self.downsample):
yolo_loss = self.yolov3_loss(
x.astype('float32'), t, gt_box, anchor, downsample,
self.scale_x_y)
for k, v in yolo_loss.items():
if k in yolo_losses:
yolo_losses[k] += v
else:
yolo_losses[k] = v
loss = 0
for k, v in yolo_losses.items():
loss += v
yolo_losses['loss'] = loss
return yolo_losses
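# A small numeric check of the scale_x_y trick in yolov3_loss above
# (illustrative; the values are made up). With scale = 1.05 the sigmoid
# output range (0, 1) is stretched to (-0.025, 1.025), letting predicted
# centers reach grid-cell borders that plain sigmoid only approaches
# asymptotically.
if __name__ == '__main__':
    scale = 1.05
    logits = paddle.to_tensor([-10.0, 0.0, 10.0])
    print(scale * F.sigmoid(logits) - 0.5 * (scale - 1.))
    # ~[-0.0250, 0.5000, 1.0250]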