更换文档检测模型 (Replace the document detection model)
This commit is contained in:
275
paddle_detection/ppdet/modeling/assigners/pose_utils.py
Normal file
275
paddle_detection/ppdet/modeling/assigners/pose_utils.py
Normal file
@@ -0,0 +1,275 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
|
||||
__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']
|
||||
|
||||
|
||||
def masked_fill(x, mask, value):
    """Return a copy of ``x`` with positions selected by ``mask`` set to ``value``.

    Args:
        x (Tensor): source tensor.
        mask (Tensor): boolean tensor, broadcast-compatible with ``x``.
        value (int | float): scalar written at the masked positions.

    Returns:
        Tensor: new tensor with the same shape and dtype as ``x``.
    """
    # paddle.where picks from the constant tensor where mask is True,
    # from x elsewhere — equivalent to an out-of-place masked_fill.
    return paddle.where(mask, paddle.full(x.shape, value, x.dtype), x)
|
||||
|
||||
|
||||
@register
class KptL1Cost(object):
    """KptL1Cost.

    L1 matching cost between predicted and ground-truth keypoints, used for
    Hungarian-style pose assignment.

    this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py

    Args:
        weight (int | float, optional): loss_weight. Default 1.0.
    """

    def __init__(self, weight=1.0):
        self.weight = weight

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
        """
        Args:
            kpt_pred (Tensor): Predicted keypoints with normalized coordinates
                (x_{i}, y_{i}), which are all in range [0, 1]. Shape
                [num_query, K, 2].
            gt_keypoints (Tensor): Ground truth keypoints with normalized
                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
                Shape [num_gt, K].

        Returns:
            paddle.Tensor: kpt_cost value with weight.
            Shape [num_query, num_gt].
        """
        kpt_cost = []
        for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                # Empty gt: contribute a zero cost that keeps the gradient
                # graph connected to kpt_pred.
                kpt_cost.append(kpt_pred.sum() * 0)
                # BUGFIX: without `continue` the loop fell through and
                # appended a second (invalid) cost for this empty gt, and
                # would compute a pairwise distance against an empty tensor.
                # The sibling FocalLossCost guard returns early the same way.
                continue
            kpt_pred_tmp = kpt_pred.clone()
            valid_flag = valid_kpt_flag[i] > 0
            valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(
                kpt_pred_tmp)
            # Zero-out predictions at invisible keypoints so only valid
            # keypoints contribute to the L1 distance (assumes invalid gt
            # coordinates are also 0, as produced upstream — verify).
            if not valid_flag_expand.all():
                kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)
            cost = F.pairwise_distance(
                kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),
                gt_keypoints[i].reshape((-1, )).unsqueeze(0),
                p=1,
                keepdim=True)
            # Normalize by the number of valid coordinates (2 per visible
            # keypoint), clipped to >= 1 to avoid division by zero.
            avg_factor = paddle.clip(
                valid_flag.astype('float32').sum() * 2, 1.0)
            cost = cost / avg_factor
            kpt_cost.append(cost)
        kpt_cost = paddle.concat(kpt_cost, axis=1)
        return kpt_cost * self.weight
|
||||
|
||||
|
||||
@register
class OksCost(object):
    """OksCost.

    Negative Object Keypoint Similarity (OKS) used as a matching cost.

    this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py

    Args:
        num_keypoints (int): number of keypoints; only the COCO (17) and
            CrowdPose (14) layouts are supported.
        weight (int | float, optional): loss_weight. Default 1.0.
    """

    def __init__(self, num_keypoints=17, weight=1.0):
        self.weight = weight
        # Per-keypoint falloff constants from the COCO keypoint evaluation.
        if num_keypoints == 17:
            self.sigmas = np.array(
                [
                    .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
                    1.07, .87, .87, .89, .89
                ],
                dtype=np.float32) / 10.0
        elif num_keypoints == 14:
            self.sigmas = np.array(
                [
                    .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,
                    .89, .79, .79
                ],
                dtype=np.float32) / 10.0
        else:
            raise ValueError(f'Unsupported keypoints number {num_keypoints}')

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):
        """
        Args:
            kpt_pred (Tensor): Predicted keypoints with unnormalized
                coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].
            gt_keypoints (Tensor): Ground truth keypoints with unnormalized
                coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
            valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
                Shape [num_gt, K].
            gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,].

        Returns:
            paddle.Tensor: oks_cost value with weight.
            Shape [num_query, num_gt].
        """
        sigmas = paddle.to_tensor(self.sigmas)
        variances = (sigmas * 2)**2

        oks_cost = []
        assert len(gt_keypoints) == len(gt_areas)
        for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                # Empty gt: zero cost that keeps the graph connected.
                oks_cost.append(kpt_pred.sum() * 0)
                # BUGFIX: without `continue` the loop fell through and
                # appended a second cost for this empty gt (the
                # `num_vis_kpt == 0` guard below already uses `continue`).
                continue
            squared_distance = \
                (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \
                (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2
            vis_flag = (valid_kpt_flag[i] > 0).astype('int')
            vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]
            num_vis_kpt = vis_ind.shape[0]
            # No visible keypoint: emit a zero cost column and skip.
            if num_vis_kpt == 0:
                oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))
                continue
            area = gt_areas[i]

            # Standard OKS kernel: exp(-d^2 / (2 * s^2 * area)), averaged
            # over the visible keypoints only.
            squared_distance0 = squared_distance / (area * variances * 2)
            squared_distance0 = paddle.index_select(
                squared_distance0, vis_ind, axis=1)
            squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,
                                                                   keepdim=True)
            oks = squared_distance1 / num_vis_kpt
            # The 1 is a constant that doesn't change the matching, so omitted.
            oks_cost.append(-oks)
        oks_cost = paddle.concat(oks_cost, axis=1)
        return oks_cost * self.weight
|
||||
|
||||
|
||||
@register
class ClassificationCost:
    """ClsSoftmaxCost.

    Softmax-based classification matching cost.

    Args:
        weight (int | float, optional): loss_weight
    """

    def __init__(self, weight=1.):
        self.weight = weight

    def __call__(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits, shape
                (num_query, num_class).
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).

        Returns:
            paddle.Tensor: cls_cost value with weight,
            shape (num_query, num_gt).
        """
        # Following the official DETR repo, contrary to the loss where NLL
        # is used, we approximate it as 1 - cls_score[gt_label]. The
        # constant 1 doesn't change the matching, so it is omitted.
        probs = F.softmax(cls_pred, axis=-1)
        return -probs[:, gt_labels] * self.weight
|
||||
|
||||
|
||||
@register
class FocalLossCost:
    """FocalLossCost.

    Focal-loss based classification matching cost, for either plain logits
    (num_query, num_class) or binary mask logits (``binary_input=True``).

    Args:
        weight (int | float, optional): loss_weight
        alpha (int | float, optional): focal_loss alpha
        gamma (int | float, optional): focal_loss gamma
        eps (float, optional): numerical stabilizer for log(), default 1e-12
        binary_input (bool, optional): Whether the input is binary,
            default False.
    """

    def __init__(self,
                 weight=1.,
                 alpha=0.25,
                 gamma=2,
                 eps=1e-12,
                 binary_input=False):
        self.weight = weight
        self.alpha = alpha
        self.gamma = gamma
        self.eps = eps
        self.binary_input = binary_input

    def _focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classification logits, shape
                (num_query, num_class).
            gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).

        Returns:
            paddle.Tensor: cls_cost value with weight
        """
        # Empty gt: zero cost that keeps the graph connected to cls_pred.
        if gt_labels.size == 0:
            return cls_pred.sum() * 0
        cls_pred = F.sigmoid(cls_pred)
        # Per-class focal terms; eps guards log(0).
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        # Gather the columns of the gt classes; cost is pos minus neg so
        # confident wrong predictions are penalized relative to background.
        cls_cost = paddle.index_select(
            pos_cost, gt_labels, axis=1) - paddle.index_select(
                neg_cost, gt_labels, axis=1)
        return cls_cost * self.weight

    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classfication logits
                in shape (num_query, d1, ..., dn), dtype=paddle.float32.
            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
                dtype=paddle.long. Labels should be binary.

        Returns:
            Tensor: Focal cost matrix with weight in shape\
                (num_query, num_gt).
        """
        cls_pred = cls_pred.flatten(1)
        # BUGFIX: paddle tensors have no `.float()` method (that is the
        # torch API); use astype('float32') instead.
        gt_labels = gt_labels.flatten(1).astype('float32')
        # n = number of flattened mask elements, used to average the cost.
        n = cls_pred.shape[1]
        cls_pred = F.sigmoid(cls_pred)
        neg_cost = -(1 - cls_pred + self.eps).log() * (
            1 - self.alpha) * cls_pred.pow(self.gamma)
        pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
            1 - cls_pred).pow(self.gamma)

        # Positive terms where gt == 1, negative terms where gt == 0,
        # contracted over the mask elements.
        cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
            paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
        return cls_cost / n * self.weight

    def __call__(self, cls_pred, gt_labels):
        """
        Args:
            cls_pred (Tensor): Predicted classfication logits.
            gt_labels (Tensor)): Labels.

        Returns:
            Tensor: Focal cost matrix with weight in shape\
                (num_query, num_gt).
        """
        if self.binary_input:
            return self._mask_focal_loss_cost(cls_pred, gt_labels)
        else:
            return self._focal_loss_cost(cls_pred, gt_labels)
|
||||
Reference in New Issue
Block a user