Replace the document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions


@@ -0,0 +1,343 @@
import copy
import glob
import os
import sys

import cv2
import numpy as np
import paddle
from PIL import Image

from ppdet.engine import Trainer
from ppdet.utils.logger import setup_logger

logger = setup_logger('ppdet_cam')

def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))
    return images

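
# Example (illustrative paths, not part of the repo): get_test_images('demo/', None)
# gathers every *.jpg/*.jpeg/*.png/*.bmp file under demo/, while
# get_test_images(None, 'demo/road.png') returns just that single file,
# because infer_img takes priority over infer_dir.
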
def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix for two given sets of boxes.

    Args:
        boxes1 (numpy ndarray with shape (N, 4)): bounding boxes with format (xmin, ymin, xmax, ymax)
        boxes2 (numpy ndarray with shape (M, 4)): bounding boxes with format (xmin, ymin, xmax, ymax)
    Returns:
        pairwise IoU matrix with shape (N, M), where the value at the ith row and
        jth column holds the IoU between the ith box of boxes1 and the jth box of boxes2.
    """
    lu = np.maximum(
        boxes1[:, None, :2], boxes2[:, :2]
    )  # lu with shape (N, M, 2); boxes1[:, None, :2] with shape (N, 1, 2), boxes2[:, :2] with shape (M, 2)
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])  # rd with shape (N, M, 2), same as lu
    intersection_wh = np.maximum(0.0, rd - lu)
    intersection_area = intersection_wh[:, :, 0] * intersection_wh[:, :, 1]  # with shape (N, M)
    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])
    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # with shape (N,)
    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])
    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # with shape (M,)
    union_area = np.maximum(
        boxes1_area[:, None] + boxes2_area - intersection_area,
        1e-8)  # with shape (N, M)
    ious = np.clip(intersection_area / union_area, 0.0, 1.0)
    return ious

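
# Worked example (illustrative): for boxes1 = np.array([[0., 0., 10., 10.]])
# and boxes2 = np.array([[5., 5., 15., 15.]]), the intersection is 5 * 5 = 25
# and the union is 100 + 100 - 25 = 175, so compute_ious returns
# [[25 / 175]] ≈ [[0.143]].
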
def grad_cam(feat, grad):
    """
    Args:
        feat: CxHxW
        grad: CxHxW
    Returns:
        cam: HxW
    """
    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)
    exp = np.maximum(-exp, 0)
    return exp

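
# Note on the formula (a hedged reading of the code above): standard Grad-CAM
# computes ReLU(sum_k alpha_k * A^k), where alpha_k is the spatial mean of the
# gradient for channel k. grad_cam follows that recipe except that it averages
# rather than sums over channels (a constant scale that does not change the
# normalized heatmap) and applies the ReLU to the negated map
# (np.maximum(-exp, 0)), keeping locations whose weighted activation is
# negative with respect to the chosen target score.
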
def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Args:
        explanation: 2D ndarray with shape (height, width)
        resize_shape: target size as (width, height)
    Returns:
        the explanation rendered as an RGB uint8 heatmap of size resize_shape
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
        f"Currently support 2D explanation results for visualization. " \
        "Reduce higher dimensions to 2D for visualization."

    explanation = (explanation - explanation.min()) / (
        explanation.max() - explanation.min())
    explanation = cv2.resize(explanation, resize_shape)
    explanation = np.uint8(255 * explanation)
    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)
    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)
    return explanation

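
# Pipeline summary (descriptive of the code above): the 2D CAM is min-max
# normalized to [0, 1] (note this divides by zero for a constant map), resized
# to the requested (width, height), scaled to uint8, colorized with OpenCV's
# JET colormap, and converted from OpenCV's BGR channel order to RGB so it can
# be composited with PIL images.
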
class BBoxCAM:
    def __init__(self, FLAGS, cfg):
        self.FLAGS = FLAGS
        self.cfg = cfg
        # build model
        self.trainer = self.build_trainer(cfg)
        # num_class
        self.num_class = cfg.num_classes
        # set hook for extraction of featuremaps and grads
        self.set_hook(cfg)
        self.nms_idx_need_divid_numclass_arch = [
            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'
        ]
        """
        In these networks, the bbox array shape before NMS contains num_classes,
        so the nms_keep_idx of each bbox needs to be divided by num_classes;
        """
        # cam image output_dir
        try:
            os.makedirs(FLAGS.cam_out)
        except FileExistsError:
            print('Path already exists.')
    def build_trainer(self, cfg):
        # build trainer
        trainer = Trainer(cfg, mode='test')
        # load weights
        trainer.load_weights(cfg.weights)
        # make the model return extra_data (scores before nms)
        trainer.model.use_extra_data = True
        # record the bbox indexes kept by nms
        if cfg.architecture in ['FasterRCNN', 'MaskRCNN']:
            trainer.model.bbox_post_process.nms.return_index = True
        elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']:
            if trainer.model.post_process is not None:
                # anchor based YOLOs: YOLOv3, PP-YOLO
                trainer.model.post_process.nms.return_index = True
            else:
                # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
                trainer.model.yolo_head.nms.return_index = True
        elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD':
            trainer.model.post_process.nms.return_index = True
        elif cfg.architecture == 'RetinaNet':
            trainer.model.head.nms.return_index = True
        else:
            print(cfg.architecture +
                  ' is not supported for cam temporarily!')
            sys.exit()
        # Todo: Unify the head/post_process name in each model
        return trainer
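
    # Note (hedged): return_index makes the NMS op additionally return the
    # indices of the kept boxes within the pre-NMS candidate array
    # ('nms_keep_idx' in extra_data), which get_bboxes_cams uses to look up
    # each surviving bbox's pre-NMS score row.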
    def set_hook(self, cfg):
        # set hook for extraction of featuremaps and grads
        self.target_feats = {}
        self.target_layer_name = cfg.target_feature_layer_name
        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor

        def hook(layer, input, output):
            self.target_feats[layer._layer_name_for_hook] = output

        try:
            exec('self.trainer.' + self.target_layer_name +
                 '._layer_name_for_hook = self.target_layer_name')
            # i.e. self.trainer.<target_layer_name>._layer_name_for_hook = self.target_layer_name
            exec('self.trainer.' + self.target_layer_name +
                 '.register_forward_post_hook(hook)')
            # i.e. self.trainer.<target_layer_name>.register_forward_post_hook(hook)
        except Exception:
            print('Error! The target_layer_name--' + self.target_layer_name +
                  ' is not in the model! Please check the spelling and '
                  "the network's architecture!")
            sys.exit()
    def get_bboxes(self):
        # get inference images
        images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img)
        # inference
        result = self.trainer.predict(
            images,
            draw_threshold=self.FLAGS.draw_threshold,
            output_dir=self.FLAGS.output_dir,
            save_results=self.FLAGS.save_results,
            visualize=False)[0]
        return result
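
    # Note (hedged reading): trainer.predict returns one result dict per image;
    # taking [0] keeps only the first image, matching the single-image Todo in
    # get_bboxes_cams. With use_extra_data=True the dict carries 'bbox' plus
    # the 'extra_data' consumed below.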
    def get_bboxes_cams(self):
        # get the bbox predictions (after-nms results) for the input
        inference_result = self.get_bboxes()

        # read the input image
        # Todo: Support folder multi-images process
        img = np.array(Image.open(self.cfg.infer_img))

        # data for calculating the bbox grad_cam
        extra_data = inference_result['extra_data']
        """
        Example of a Faster_RCNN based architecture:
        extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
                     'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1]
                    }
        Example of a YOLOv3 based architecture:
        extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 8400]
                     'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], for example: [300, 1]
                    }
        """

        # array indexes of the predicted bboxes before nms
        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:
            # in some networks the bbox array shape before nms is
            # [num_of_bboxes_before_nms, num_classes, 4], so nms_keep_idx must
            # be divided by num_classes to recover the before-nms index
            # (currently only the RCNN architectures: FasterRCNN, MaskRCNN,
            # CascadeRCNN)
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(
            ) // self.num_class
        else:
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()
        # calculate and visualize the heatmap of each predicted bbox
        for index, target_bbox in enumerate(inference_result['bbox']):
            # target_bbox: [cls, score, x1, y1, x2, y2]
            # filter out bboxes with low predicted scores
            if target_bbox[1] < self.FLAGS.draw_threshold:
                continue

            target_bbox_before_nms = int(before_nms_indexes[index])

            if len(extra_data['scores'].shape) == 2:
                score_out = extra_data['scores'][target_bbox_before_nms]
            else:
                score_out = extra_data['scores'][0, :, target_bbox_before_nms]
            """
            There are two kinds of array shapes for the bbox score output:
            1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
            2) [num_of_images, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000]
            """

            # construct a one_hot label and backpropagate to get the gradients
            predicted_label = paddle.argmax(score_out)
            label_onehot = paddle.nn.functional.one_hot(
                predicted_label, num_classes=len(score_out))
            label_onehot = label_onehot.squeeze()
            target = paddle.sum(score_out * label_onehot)
            target.backward(retain_graph=True)
            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:  # backbone/neck level feature
                if isinstance(self.target_feats[self.target_layer_name], list):
                    # when the featuremap contains multiple scales,
                    # take the featuremap of the last scale
                    # Todo: fuse the cam results from multiscale featuremaps
                    if self.target_feats[self.target_layer_name][-1].shape[-1] == 1:
                        # if the last level featuremap is 1x1 size,
                        # take the second to last one
                        cam_grad = self.target_feats[self.target_layer_name][
                            -2].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -2].squeeze().cpu().numpy()
                    else:
                        cam_grad = self.target_feats[self.target_layer_name][
                            -1].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -1].squeeze().cpu().numpy()
                else:
                    cam_grad = self.target_feats[
                        self.target_layer_name].grad.squeeze().cpu().numpy()
                    cam_feat = self.target_feats[
                        self.target_layer_name].squeeze().cpu().numpy()
            else:  # roi level feature
                cam_grad = self.target_feats[
                    self.target_layer_name].grad.squeeze().cpu().numpy()[
                        target_bbox_before_nms]
                cam_feat = self.target_feats[
                    self.target_layer_name].squeeze().cpu().numpy()[
                        target_bbox_before_nms]
            # grad_cam:
            exp = grad_cam(cam_feat, cam_grad)

            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:
                # when using a backbone/neck featuremap, first compute the cam
                # on the whole image, then zero the area outside the predicted
                # bbox

                # resize the cam image to the input image size
                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
                mask = np.zeros((img.shape[0], img.shape[1], 3))
                mask[int(target_bbox[3]):int(target_bbox[5]),
                     int(target_bbox[2]):int(target_bbox[4]), :] = 1
                resized_exp = resized_exp * mask
                # overlay the bbox cam on the input image
                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
            elif 'roi' in self.target_layer_name:
                # crop the bbox part of the image
                bbox_img = copy.deepcopy(
                    img[int(target_bbox[3]):int(target_bbox[5]),
                        int(target_bbox[2]):int(target_bbox[4]), :])
                # resize the cam image to the bbox size
                resized_exp = resize_cam(exp,
                                         (bbox_img.shape[1], bbox_img.shape[0]))
                # overlay the bbox cam on the bbox image
                bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
                # put the bbox_cam image back into the original image
                overlay_vis = copy.deepcopy(img)
                overlay_vis[int(target_bbox[3]):int(target_bbox[5]),
                            int(target_bbox[2]):int(target_bbox[4]), :] = \
                    bbox_overlay_vis
            else:
                print('Only cam for backbone/neck features and roi features is '
                      'supported; others are not supported temporarily!')
                sys.exit()

            # draw the bbox rectangle on the image
            cv2.rectangle(
                overlay_vis, (int(target_bbox[2]), int(target_bbox[3])),
                (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2)

            # save the visualization result
            cam_image = Image.fromarray(overlay_vis)
            cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg')

            # clear gradients after each bbox grad_cam
            target.clear_gradient()
            for n, v in self.trainer.model.named_sublayers():
                v.clear_gradients()
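
# Minimal usage sketch (illustrative; the config path, weights path and FLAGS
# fields below are assumptions following PaddleDetection's tools conventions,
# not something defined in this file):
#
#   from ppdet.core.workspace import load_config
#
#   cfg = load_config('configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml')
#   cfg.weights = 'output/yolov3_mobilenet_v3_large_270e_coco/model_final'
#   cfg.target_feature_layer_name = 'model.backbone'  # or e.g. 'model.neck'
#   # FLAGS is expected to provide: infer_img (also set on cfg.infer_img),
#   # infer_dir, draw_threshold, output_dir, save_results and cam_out
#   bbox_cam = BBoxCAM(FLAGS, cfg)
#   bbox_cam.get_bboxes_cams()  # writes one heatmap jpg per kept bbox to cam_out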