# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register
__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost']


def masked_fill(x, mask, value):
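    # Equivalent of torch's Tensor.masked_fill: positions where `mask` is True
    # are replaced by `value`, the rest keep the original values of `x`.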
y = paddle.full(x.shape, value, x.dtype)
return paddle.where(mask, y, x)


@register
class KptL1Cost(object):
"""KptL1Cost.
this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
Args:
weight (int | float, optional): loss_weight.
"""

    def __init__(self, weight=1.0):
        self.weight = weight

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag):
"""
Args:
kpt_pred (Tensor): Predicted keypoints with normalized coordinates
(x_{i}, y_{i}), which are all in range [0, 1]. Shape
[num_query, K, 2].
gt_keypoints (Tensor): Ground truth keypoints with normalized
coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
Shape [num_gt, K].
Returns:
paddle.Tensor: kpt_cost value with weight.
"""
kpt_cost = []
for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                # Empty gt: append a zero placeholder that stays connected to
                # the prediction graph, and skip the distance computation.
                kpt_cost.append(kpt_pred.sum() * 0)
                continue
kpt_pred_tmp = kpt_pred.clone()
valid_flag = valid_kpt_flag[i] > 0
valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as(
kpt_pred_tmp)
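            # Zero out predictions at invisible joints so that, together with
            # the zero-padded gt coordinates, those joints add nothing to the
            # L1 distance below.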
if not valid_flag_expand.all():
kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0)
cost = F.pairwise_distance(
kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)),
gt_keypoints[i].reshape((-1, )).unsqueeze(0),
p=1,
keepdim=True)
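            # Normalize by 2 * (#visible keypoints): each visible joint
            # contributes an x and a y term; clip to 1 to avoid division by 0.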
avg_factor = paddle.clip(
valid_flag.astype('float32').sum() * 2, 1.0)
cost = cost / avg_factor
kpt_cost.append(cost)
kpt_cost = paddle.concat(kpt_cost, axis=1)
return kpt_cost * self.weight


@register
class OksCost(object):
"""OksCost.
this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py
Args:
num_keypoints (int): number of keypoints
weight (int | float, optional): loss_weight.
"""

    def __init__(self, num_keypoints=17, weight=1.0):
self.weight = weight
if num_keypoints == 17:
self.sigmas = np.array(
[
.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07,
1.07, .87, .87, .89, .89
],
dtype=np.float32) / 10.0
elif num_keypoints == 14:
self.sigmas = np.array(
[
.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89,
.89, .79, .79
],
dtype=np.float32) / 10.0
else:
raise ValueError(f'Unsupported keypoints number {num_keypoints}')

    def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas):
"""
Args:
kpt_pred (Tensor): Predicted keypoints with unnormalized
coordinates (x_{i}, y_{i}). Shape [num_query, K, 2].
gt_keypoints (Tensor): Ground truth keypoints with unnormalized
coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2].
valid_kpt_flag (Tensor): valid flag of ground truth keypoints.
Shape [num_gt, K].
gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,].
Returns:
paddle.Tensor: oks_cost value with weight.
"""
sigmas = paddle.to_tensor(self.sigmas)
variances = (sigmas * 2)**2
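        # k_i = 2 * sigma_i as in the COCO OKS metric; the OKS exponent used
        # below is d_i^2 / (2 * area * k_i^2).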
oks_cost = []
assert len(gt_keypoints) == len(gt_areas)
for i in range(len(gt_keypoints)):
            if gt_keypoints[i].size == 0:
                # Empty gt: append a zero placeholder that stays connected to
                # the prediction graph, and skip the OKS computation.
                oks_cost.append(kpt_pred.sum() * 0)
                continue
squared_distance = \
(kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \
(kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2
vis_flag = (valid_kpt_flag[i] > 0).astype('int')
vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0]
num_vis_kpt = vis_ind.shape[0]
# assert num_vis_kpt > 0
if num_vis_kpt == 0:
oks_cost.append(paddle.zeros((squared_distance.shape[0], 1)))
continue
area = gt_areas[i]
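            # Standard OKS term per keypoint: exp(-d^2 / (2 * area * (2*sigma)^2)),
            # averaged over the visible keypoints only.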
squared_distance0 = squared_distance / (area * variances * 2)
squared_distance0 = paddle.index_select(
squared_distance0, vis_ind, axis=1)
squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1,
keepdim=True)
oks = squared_distance1 / num_vis_kpt
# The 1 is a constant that doesn't change the matching, so omitted.
oks_cost.append(-oks)
oks_cost = paddle.concat(oks_cost, axis=1)
return oks_cost * self.weight


@register
class ClassificationCost:
"""ClsSoftmaxCost.
Args:
weight (int | float, optional): loss_weight
"""

    def __init__(self, weight=1.):
        self.weight = weight

    def __call__(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
(num_query, num_class).
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
paddle.Tensor: cls_cost value with weight
"""
        # Following the official DETR repo: instead of the NLL used in the
        # loss, the matching cost approximates it as 1 - cls_score[gt_label].
        # The constant 1 does not change the matching, so it is omitted.
cls_score = cls_pred.softmax(-1)
cls_cost = -cls_score[:, gt_labels]
return cls_cost * self.weight


@register
class FocalLossCost:
"""FocalLossCost.
Args:
weight (int | float, optional): loss_weight
alpha (int | float, optional): focal_loss alpha
gamma (int | float, optional): focal_loss gamma
eps (float, optional): default 1e-12
binary_input (bool, optional): Whether the input is binary,
default False.
"""

    def __init__(self,
weight=1.,
alpha=0.25,
gamma=2,
eps=1e-12,
binary_input=False):
self.weight = weight
self.alpha = alpha
self.gamma = gamma
self.eps = eps
self.binary_input = binary_input

    def _focal_loss_cost(self, cls_pred, gt_labels):
"""
Args:
cls_pred (Tensor): Predicted classification logits, shape
(num_query, num_class).
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
Returns:
paddle.Tensor: cls_cost value with weight
"""
if gt_labels.size == 0:
return cls_pred.sum() * 0
cls_pred = F.sigmoid(cls_pred)
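        # neg_cost is the focal term for treating a class as background,
        # pos_cost for treating it as foreground; the matching cost gathers
        # pos_cost - neg_cost at the gt labels for every query.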
neg_cost = -(1 - cls_pred + self.eps).log() * (
1 - self.alpha) * cls_pred.pow(self.gamma)
pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
1 - cls_pred).pow(self.gamma)
cls_cost = paddle.index_select(
pos_cost, gt_labels, axis=1) - paddle.index_select(
neg_cost, gt_labels, axis=1)
return cls_cost * self.weight

    def _mask_focal_loss_cost(self, cls_pred, gt_labels):
"""
Args:
            cls_pred (Tensor): Predicted classification logits
                in shape (num_query, d1, ..., dn), dtype=paddle.float32.
            gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
                dtype=paddle.int64. Labels should be binary.
        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
"""
cls_pred = cls_pred.flatten(1)
        gt_labels = gt_labels.flatten(1).astype('float32')
n = cls_pred.shape[1]
cls_pred = F.sigmoid(cls_pred)
neg_cost = -(1 - cls_pred + self.eps).log() * (
1 - self.alpha) * cls_pred.pow(self.gamma)
pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
1 - cls_pred).pow(self.gamma)
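        # For every (query, gt) pair, sum the positive focal term over the
        # locations where the gt mask is 1 and the negative term where it is
        # 0, then average over the n flattened locations.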
cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \
paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
return cls_cost / n * self.weight

    def __call__(self, cls_pred, gt_labels):
"""
Args:
            cls_pred (Tensor): Predicted classification logits.
            gt_labels (Tensor): Labels.
        Returns:
            Tensor: Focal cost matrix with weight in shape
                (num_query, num_gt).
"""
if self.binary_input:
return self._mask_focal_loss_cost(cls_pred, gt_labels)
else:
return self._focal_loss_cost(cls_pred, gt_labels)
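

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only). It shows how these cost modules
# could be combined into a single cost matrix and fed to a Hungarian solver,
# roughly mirroring how a pose assigner would use them. All shapes, weights,
# and the direct call to scipy's linear_sum_assignment are assumptions made
# for this demo, not the actual ppdet assigner.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from scipy.optimize import linear_sum_assignment

    num_query, num_gt, num_kpts, num_classes = 100, 3, 17, 1

    cls_pred = paddle.randn((num_query, num_classes))        # logits
    kpt_pred = paddle.rand((num_query, num_kpts, 2))         # normalized xy
    gt_labels = paddle.zeros((num_gt, ), dtype='int64')      # single class
    gt_keypoints = paddle.rand((num_gt, num_kpts, 2))
    valid_kpt_flag = paddle.ones((num_gt, num_kpts))         # all visible
    gt_areas = paddle.full((num_gt, ), 0.1)

    cls_cost = FocalLossCost(weight=2.0)(cls_pred, gt_labels)
    kpt_cost = KptL1Cost(weight=70.0)(kpt_pred, gt_keypoints, valid_kpt_flag)
    oks_cost = OksCost(num_keypoints=num_kpts, weight=7.0)(
        kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas)

    cost = (cls_cost + kpt_cost + oks_cost).numpy()          # [num_query, num_gt]
    query_idx, gt_idx = linear_sum_assignment(cost)
    print('matched queries:', query_idx, '-> gts:', gt_idx)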