Replace the document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions


@@ -0,0 +1,343 @@
import copy
import glob
import os
import sys

import cv2
import numpy as np
import paddle
from PIL import Image

from ppdet.engine import Trainer
from ppdet.utils.logger import setup_logger

logger = setup_logger('ppdet_cam')

def get_test_images(infer_dir, infer_img):
    """
    Get image path list in TEST mode
    """
    assert infer_img is not None or infer_dir is not None, \
        "--infer_img or --infer_dir should be set"
    assert infer_img is None or os.path.isfile(infer_img), \
        "{} is not a file".format(infer_img)
    assert infer_dir is None or os.path.isdir(infer_dir), \
        "{} is not a directory".format(infer_dir)

    # infer_img has a higher priority
    if infer_img and os.path.isfile(infer_img):
        return [infer_img]

    images = set()
    infer_dir = os.path.abspath(infer_dir)
    assert os.path.isdir(infer_dir), \
        "infer_dir {} is not a directory".format(infer_dir)
    exts = ['jpg', 'jpeg', 'png', 'bmp']
    exts += [ext.upper() for ext in exts]
    for ext in exts:
        images.update(glob.glob('{}/*.{}'.format(infer_dir, ext)))
    images = list(images)

    assert len(images) > 0, "no image found in {}".format(infer_dir)
    logger.info("Found {} inference images in total.".format(len(images)))
    return images

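
# Example (illustrative paths, not part of the repo): get_test_images('demo/', None)
# gathers every *.jpg/*.jpeg/*.png/*.bmp file under demo/, while
# get_test_images(None, 'demo/road.png') returns just that single file,
# because infer_img takes priority over infer_dir.
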
def compute_ious(boxes1, boxes2):
    """Compute the pairwise IoU matrix for two given sets of boxes.

    Args:
        boxes1 (numpy ndarray with shape (N, 4)): bounding boxes with format (xmin, ymin, xmax, ymax)
        boxes2 (numpy ndarray with shape (M, 4)): bounding boxes with format (xmin, ymin, xmax, ymax)
    Returns:
        pairwise IoU matrix with shape (N, M), where the value at the ith row and
        jth column holds the IoU between the ith box of boxes1 and the jth box of boxes2.
    """
    lu = np.maximum(
        boxes1[:, None, :2], boxes2[:, :2]
    )  # lu with shape (N, M, 2); boxes1[:, None, :2] with shape (N, 1, 2), boxes2[:, :2] with shape (M, 2)
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])  # rd with shape (N, M, 2), same as lu
    intersection_wh = np.maximum(0.0, rd - lu)
    intersection_area = intersection_wh[:, :, 0] * intersection_wh[:, :, 1]  # with shape (N, M)
    boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2])
    boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1]  # with shape (N,)
    boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2])
    boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1]  # with shape (M,)
    union_area = np.maximum(
        boxes1_area[:, None] + boxes2_area - intersection_area,
        1e-8)  # with shape (N, M)
    ious = np.clip(intersection_area / union_area, 0.0, 1.0)
    return ious

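
# Worked example (illustrative): for boxes1 = np.array([[0., 0., 10., 10.]])
# and boxes2 = np.array([[5., 5., 15., 15.]]), the intersection is 5 * 5 = 25
# and the union is 100 + 100 - 25 = 175, so compute_ious returns
# [[25 / 175]] ≈ [[0.143]].
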
def grad_cam(feat, grad):
    """
    Args:
        feat: CxHxW
        grad: CxHxW
    Returns:
        cam: HxW
    """
    exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0)
    exp = np.maximum(-exp, 0)
    return exp

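
# Note on the formula (a hedged reading of the code above): standard Grad-CAM
# computes ReLU(sum_k alpha_k * A^k), where alpha_k is the spatial mean of the
# gradient for channel k. grad_cam follows that recipe except that it averages
# rather than sums over channels (a constant scale that does not change the
# normalized heatmap) and applies the ReLU to the negated map
# (np.maximum(-exp, 0)), keeping locations whose weighted activation is
# negative with respect to the chosen target score.
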
def resize_cam(explanation, resize_shape) -> np.ndarray:
    """
    Args:
        explanation: 2D ndarray with shape (height, width)
        resize_shape: target size as (width, height)
    Returns:
        the explanation rendered as an RGB uint8 heatmap of size resize_shape
    """
    assert len(explanation.shape) == 2, f"{explanation.shape}. " \
        f"Currently support 2D explanation results for visualization. " \
        "Reduce higher dimensions to 2D for visualization."

    explanation = (explanation - explanation.min()) / (
        explanation.max() - explanation.min())
    explanation = cv2.resize(explanation, resize_shape)
    explanation = np.uint8(255 * explanation)
    explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET)
    explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB)
    return explanation

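
# Pipeline summary (descriptive of the code above): the 2D CAM is min-max
# normalized to [0, 1] (note this divides by zero for a constant map), resized
# to the requested (width, height), scaled to uint8, colorized with OpenCV's
# JET colormap, and converted from OpenCV's BGR channel order to RGB so it can
# be composited with PIL images.
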
class BBoxCAM:
    def __init__(self, FLAGS, cfg):
        self.FLAGS = FLAGS
        self.cfg = cfg
        # build model
        self.trainer = self.build_trainer(cfg)
        # num_class
        self.num_class = cfg.num_classes
        # set hook for extraction of featuremaps and grads
        self.set_hook(cfg)
        self.nms_idx_need_divid_numclass_arch = [
            'FasterRCNN', 'MaskRCNN', 'CascadeRCNN'
        ]
        """
        In these networks, the bbox array shape before NMS contains num_classes,
        so the nms_keep_idx of each bbox needs to be divided by num_classes;
        """
        # cam image output_dir
        try:
            os.makedirs(FLAGS.cam_out)
        except FileExistsError:
            print('Path already exists.')
    def build_trainer(self, cfg):
        # build trainer
        trainer = Trainer(cfg, mode='test')
        # load weights
        trainer.load_weights(cfg.weights)
        # make the model return extra_data (scores before nms)
        trainer.model.use_extra_data = True
        # record the bbox indexes kept by nms
        if cfg.architecture in ['FasterRCNN', 'MaskRCNN']:
            trainer.model.bbox_post_process.nms.return_index = True
        elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']:
            if trainer.model.post_process is not None:
                # anchor based YOLOs: YOLOv3, PP-YOLO
                trainer.model.post_process.nms.return_index = True
            else:
                # anchor free YOLOs: PP-YOLOE, PP-YOLOE+
                trainer.model.yolo_head.nms.return_index = True
        elif cfg.architecture == 'BlazeFace' or cfg.architecture == 'SSD':
            trainer.model.post_process.nms.return_index = True
        elif cfg.architecture == 'RetinaNet':
            trainer.model.head.nms.return_index = True
        else:
            print(cfg.architecture +
                  ' is not supported for cam temporarily!')
            sys.exit()
        # Todo: Unify the head/post_process name in each model
        return trainer
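
    # Note (hedged): return_index makes the NMS op additionally return the
    # indices of the kept boxes within the pre-NMS candidate array
    # ('nms_keep_idx' in extra_data), which get_bboxes_cams uses to look up
    # each surviving bbox's pre-NMS score row.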
    def set_hook(self, cfg):
        # set hook for extraction of featuremaps and grads
        self.target_feats = {}
        self.target_layer_name = cfg.target_feature_layer_name
        # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor

        def hook(layer, input, output):
            self.target_feats[layer._layer_name_for_hook] = output

        try:
            exec('self.trainer.' + self.target_layer_name +
                 '._layer_name_for_hook = self.target_layer_name')
            # i.e. self.trainer.<target_layer_name>._layer_name_for_hook = self.target_layer_name
            exec('self.trainer.' + self.target_layer_name +
                 '.register_forward_post_hook(hook)')
            # i.e. self.trainer.<target_layer_name>.register_forward_post_hook(hook)
        except Exception:
            print('Error! The target_layer_name--' + self.target_layer_name +
                  ' is not in the model! Please check the spelling and '
                  "the network's architecture!")
            sys.exit()
    def get_bboxes(self):
        # get inference images
        images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img)
        # inference
        result = self.trainer.predict(
            images,
            draw_threshold=self.FLAGS.draw_threshold,
            output_dir=self.FLAGS.output_dir,
            save_results=self.FLAGS.save_results,
            visualize=False)[0]
        return result
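
    # Note (hedged reading): trainer.predict returns one result dict per image;
    # taking [0] keeps only the first image, matching the single-image Todo in
    # get_bboxes_cams. With use_extra_data=True the dict carries 'bbox' plus
    # the 'extra_data' consumed below.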
    def get_bboxes_cams(self):
        # get the bbox predictions (after-nms results) for the input
        inference_result = self.get_bboxes()

        # read the input image
        # Todo: Support folder multi-images process
        img = np.array(Image.open(self.cfg.infer_img))

        # data for calculating the bbox grad_cam
        extra_data = inference_result['extra_data']
        """
        Example of a Faster_RCNN based architecture:
        extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
                     'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1]
                    }
        Example of a YOLOv3 based architecture:
        extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 8400]
                     'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], for example: [300, 1]
                    }
        """

        # array indexes of the predicted bboxes before nms
        if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch:
            # in some networks the bbox array shape before nms is
            # [num_of_bboxes_before_nms, num_classes, 4], so nms_keep_idx must
            # be divided by num_classes to recover the before-nms index
            # (currently only the RCNN architectures: FasterRCNN, MaskRCNN,
            # CascadeRCNN)
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy(
            ) // self.num_class
        else:
            before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy()
        # calculate and visualize the heatmap of each predicted bbox
        for index, target_bbox in enumerate(inference_result['bbox']):
            # target_bbox: [cls, score, x1, y1, x2, y2]
            # filter out bboxes with low predicted scores
            if target_bbox[1] < self.FLAGS.draw_threshold:
                continue

            target_bbox_before_nms = int(before_nms_indexes[index])

            if len(extra_data['scores'].shape) == 2:
                score_out = extra_data['scores'][target_bbox_before_nms]
            else:
                score_out = extra_data['scores'][0, :, target_bbox_before_nms]
            """
            There are two kinds of array shapes for the bbox score output:
            1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80]
            2) [num_of_images, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000]
            """

            # construct a one_hot label and backpropagate to get the gradients
            predicted_label = paddle.argmax(score_out)
            label_onehot = paddle.nn.functional.one_hot(
                predicted_label, num_classes=len(score_out))
            label_onehot = label_onehot.squeeze()
            target = paddle.sum(score_out * label_onehot)
            target.backward(retain_graph=True)
            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:  # backbone/neck level feature
                if isinstance(self.target_feats[self.target_layer_name], list):
                    # when the featuremap contains multiple scales,
                    # take the featuremap of the last scale
                    # Todo: fuse the cam results from multiscale featuremaps
                    if self.target_feats[self.target_layer_name][-1].shape[-1] == 1:
                        # if the last level featuremap is 1x1 size,
                        # take the second to last one
                        cam_grad = self.target_feats[self.target_layer_name][
                            -2].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -2].squeeze().cpu().numpy()
                    else:
                        cam_grad = self.target_feats[self.target_layer_name][
                            -1].grad.squeeze().cpu().numpy()
                        cam_feat = self.target_feats[self.target_layer_name][
                            -1].squeeze().cpu().numpy()
                else:
                    cam_grad = self.target_feats[
                        self.target_layer_name].grad.squeeze().cpu().numpy()
                    cam_feat = self.target_feats[
                        self.target_layer_name].squeeze().cpu().numpy()
            else:  # roi level feature
                cam_grad = self.target_feats[
                    self.target_layer_name].grad.squeeze().cpu().numpy()[
                        target_bbox_before_nms]
                cam_feat = self.target_feats[
                    self.target_layer_name].squeeze().cpu().numpy()[
                        target_bbox_before_nms]
            # grad_cam:
            exp = grad_cam(cam_feat, cam_grad)

            if 'backbone' in self.target_layer_name or \
                    'neck' in self.target_layer_name:
                # when using a backbone/neck featuremap, first compute the cam
                # on the whole image, then zero the area outside the predicted
                # bbox

                # resize the cam image to the input image size
                resized_exp = resize_cam(exp, (img.shape[1], img.shape[0]))
                mask = np.zeros((img.shape[0], img.shape[1], 3))
                mask[int(target_bbox[3]):int(target_bbox[5]),
                     int(target_bbox[2]):int(target_bbox[4]), :] = 1
                resized_exp = resized_exp * mask
                # overlay the bbox cam on the input image
                overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6)
            elif 'roi' in self.target_layer_name:
                # crop the bbox part of the image
                bbox_img = copy.deepcopy(
                    img[int(target_bbox[3]):int(target_bbox[5]),
                        int(target_bbox[2]):int(target_bbox[4]), :])
                # resize the cam image to the bbox size
                resized_exp = resize_cam(exp,
                                         (bbox_img.shape[1], bbox_img.shape[0]))
                # overlay the bbox cam on the bbox image
                bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6)
                # put the bbox_cam image back into the original image
                overlay_vis = copy.deepcopy(img)
                overlay_vis[int(target_bbox[3]):int(target_bbox[5]),
                            int(target_bbox[2]):int(target_bbox[4]), :] = \
                    bbox_overlay_vis
            else:
                print('Only cam for backbone/neck features and roi features is '
                      'supported; others are not supported temporarily!')
                sys.exit()

            # draw the bbox rectangle on the image
            cv2.rectangle(
                overlay_vis, (int(target_bbox[2]), int(target_bbox[3])),
                (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2)

            # save the visualization result
            cam_image = Image.fromarray(overlay_vis)
            cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg')

            # clear gradients after each bbox grad_cam
            target.clear_gradient()
            for n, v in self.trainer.model.named_sublayers():
                v.clear_gradients()
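
# Minimal usage sketch (illustrative; the config path, weights path and FLAGS
# fields below are assumptions following PaddleDetection's tools conventions,
# not something defined in this file):
#
#   from ppdet.core.workspace import load_config
#
#   cfg = load_config('configs/yolov3/yolov3_mobilenet_v3_large_270e_coco.yml')
#   cfg.weights = 'output/yolov3_mobilenet_v3_large_270e_coco/model_final'
#   cfg.target_feature_layer_name = 'model.backbone'  # or e.g. 'model.neck'
#   # FLAGS is expected to provide: infer_img (also set on cfg.infer_img),
#   # infer_dir, draw_threshold, output_dir, save_results and cam_out
#   bbox_cam = BBoxCAM(FLAGS, cfg)
#   bbox_cam.get_bboxes_cams()  # writes one heatmap jpg per kept bbox to cam_out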