更换文档检测模型

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions
--- a/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/README.md
+++ b/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/README.md
@@ -0,0 +1,56 @@
+# 更多应用
+
+
+## 1. 版面分析任务
+
+版面分析指的是对图片形式的文档进行区域划分，定位其中的关键区域，如文字、标题、表格、图片等。版面分析示意图如下图所示。
+
+<div align="center">
+    <img src="images/layout_demo.png" width="800">
+</div>
+
+### 1.1 数据集
+
+使用[PubLayNet](https://github.com/ibm-aur-nlp/PubLayNet)训练英文文档版面分析模型。该数据集面向英文文献类（论文）场景，分为训练集(333,703张标注图片)、验证集(11,245张标注图片)和测试集(11,405张图片)，包含5类：Table、Figure、Title、Text、List。更多数据集请参考[版面分析数据集](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README.md#32)。
+
+### 1.2 模型库
+
+使用PicoDet模型在PubLayNet数据集进行训练，同时采用FGD蒸馏，预训练模型如下:
+
+| 模型     | 图像输入尺寸 | mAP<sup>val<br/>0.5</sup> |  下载地址  |  配置文件  |
+| :-------- | :--------: |  :----------------: | :---------------: | ----------------- |
+| PicoDet-LCNet_x1_0 |  800*608   |   93.5% | [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout.pdparams) &#124; [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_layout_infer.tar) | [config](./picodet_lcnet_x1_0_layout.yml) |
+| PicoDet-LCNet_x1_0 + FGD |  800*608   |   94.0%     | [trained model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout.pdparams) &#124; [inference model](https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar) | [teacher config](./picodet_lcnet_x2_5_layout.yml)&#124;[student config](./picodet_lcnet_x1_0_layout.yml) |
+
+ [FGD蒸馏介绍](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/configs/slim/distill/README.md)
+
+### 1.3 模型推理
+
+了解版面分析整个流程(数据准备、模型训练、评估等)，请参考[版面分析](https://github.com/PaddlePaddle/PaddleOCR/blob/dygraph/ppstructure/layout/README.md)，这里仅展示模型推理过程。首先下载模型库中的inference_model模型。
+
+```bash
+mkdir inference_model
+cd inference_model
+# 下载并解压PubLayNet推理模型
+wget https://paddleocr.bj.bcebos.com/ppstructure/models/layout/picodet_lcnet_x1_0_fgd_layout_infer.tar && tar xf picodet_lcnet_x1_0_fgd_layout_infer.tar
+cd ..
+```
+
+使用版面分析模型进行推理，可以执行如下命令：
+
+```bash
+python3 deploy/python/infer.py \
+    --model_dir=inference_model/picodet_lcnet_x1_0_fgd_layout_infer/ \
+    --image_file=docs/images/layout.jpg \
+    --device=CPU
+```
+
+可视化版面结果如下图所示：
+
+<div align="center">
+    <img src="images/layout_res.jpg" width="800">
+</div>
+
+## 2. Reference
+
+[1] Zhong X, Tang J, Yepes A J. Publaynet: largest dataset ever for document layout analysis[C]//2019 International Conference on Document Analysis and Recognition (ICDAR). IEEE, 2019: 1015-1022.
--- a/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/images/layout_demo.png
+++ b/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/images/layout_demo.png
--- a/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/images/layout_res.jpg
+++ b/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/images/layout_res.jpg
--- a/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
+++ b/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
@@ -0,0 +1,90 @@
+_BASE_: [  # parent config files listed for this layout-analysis config
+  '../../../../runtime.yml',
+  '../../_base_/picodet_esnet.yml',  # NOTE(review): ESNet-named base, while PicoDet.backbone below is LCNet
+  '../../_base_/optimizer_100e.yml',
+  '../../_base_/picodet_640_reader.yml',  # NOTE(review): 640-named reader base; readers are redefined in this file
+]
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/LCNet_x1_0_pretrained.pdparams  # LCNet x1.0 weights, going by the file name
+weights: output/picodet_lcnet_x1_0_layout/model_final
+find_unused_parameters: True
+use_ema: true
+cycle_epoch: 10
+snapshot_epoch: 1
+epoch: 100  # same count as the "100e" in the optimizer base file name
+
+PicoDet:  # detector assembled from an LCNet backbone, CSPPAN neck and PicoHead head
+  backbone: LCNet
+  neck: CSPPAN
+  head: PicoHead
+  nms_cpu: True
+
+LCNet:
+  scale: 1.0  # x1.0 variant, consistent with the pretrained weights file name
+  feature_maps: [3, 4, 5]
+
+metric: COCO
+num_classes: 5  # the README in this directory lists 5 layout classes: Table, Figure, Title, Text, List
+
+TrainDataset:  # COCODataSet reading the `train` images and train.json under ./dataset/publaynet/
+    name: COCODataSet
+    image_dir: train
+    anno_path: train.json
+    dataset_dir: ./dataset/publaynet/
+    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:  # COCODataSet reading the `val` images and val.json under ./dataset/publaynet/
+    name: COCODataSet
+    image_dir: val
+    anno_path: val.json
+    dataset_dir: ./dataset/publaynet/
+
+TestDataset:  # ImageFolder-tagged source that points at the validation annotation file
+  !ImageFolder
+    anno_path: ./dataset/publaynet/val.json
+
+
+worker_num: 8
+eval_height: &eval_height 800  # anchors holding the evaluation input size (height, then width)
+eval_width: &eval_width 608
+eval_size: &eval_size [*eval_height, *eval_width]  # expands to [800, 608]
+
+TrainReader:  # training pipeline: per-sample augmentation, then batch-level resize/normalize/permute
+  sample_transforms:
+  - Decode: {}
+  - RandomCrop: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomDistort: {}
+  batch_transforms:
+  - BatchRandomResize: {target_size: [[768, 576], [800, 608], [832, 640]], random_size: True, random_interp: True, keep_ratio: False}  # candidate sizes are the 800x608 eval size and +/-32 on each side
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 24
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+
+EvalReader:  # evaluation pipeline; the resize target comes from the eval_size anchor defined earlier in this file
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}  # *eval_size expands to [800, 608]
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 8
+  shuffle: false
+
+
+TestReader:  # single-image inference pipeline; sizes come from the eval_* anchors defined earlier in this file
+  inputs_def:
+    image_shape: [1, 3, *eval_height, *eval_width]  # expands to [1, 3, 800, 608]
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: *eval_size, keep_ratio: False}  # *eval_size expands to [800, 608]
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 1
+  shuffle: false
--- a/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml
+++ b/paddle_detection/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x2_5_layout.yml
@@ -0,0 +1,34 @@
+_BASE_: [  # the README in this directory links this file as the FGD "teacher config"
+  '../../_base_/picodet_esnet.yml',  # NOTE(review): ESNet-named base, while PicoDet.backbone below is LCNet
+]
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/LCNet_x2_5_ssld_pretrained.pdparams  # LCNet x2.5 weights, going by the file name
+weights: output/picodet_lcnet_x2_5_layout/model_final
+find_unused_parameters: True
+
+PicoDet:  # same component names as the x1.0 layout config: LCNet / CSPPAN / PicoHead
+  backbone: LCNet
+  neck: CSPPAN
+  head: PicoHead
+  nms_cpu: True
+
+LCNet:
+  scale: 2.5  # x2.5 variant, consistent with the pretrained weights file name
+  feature_maps: [3, 4, 5]
+
+CSPPAN:
+  spatial_scales: [0.125, 0.0625, 0.03125]  # i.e. 1/8, 1/16, 1/32
+
+slim: Distill  # distillation settings: method FGD with the FGDFeatureLoss configured below
+slim_method: FGD
+distill_loss: FGDFeatureLoss
+distill_loss_name: ['neck_f_3', 'neck_f_2', 'neck_f_1', 'neck_f_0']  # four named entries, neck_f_3 down to neck_f_0
+
+FGDFeatureLoss:
+  student_channels: 128  # student and teacher channel counts are both 128
+  teacher_channels: 128
+  temp: 0.5
+  alpha_fgd: 0.001
+  beta_fgd: 0.0005
+  gamma_fgd: 0.0005
+  lambda_fgd: 0.000005  # i.e. 5e-6
--- a/paddle_detection/configs/picodet/legacy_model/application/mainbody_detection/README.md
+++ b/paddle_detection/configs/picodet/legacy_model/application/mainbody_detection/README.md
@@ -0,0 +1,30 @@
+# 更多应用
+
+
+## 1. 主体检测任务
+
+主体检测技术是目前应用非常广泛的一种检测技术，它指的是检测出图片中一个或者多个主体的坐标位置，然后将图像中的对应区域裁剪下来，进行识别，从而完成整个识别过程。主体检测是识别任务的前序步骤，可以有效提升识别精度。
+
+主体检测是图像识别的前序步骤，被用于PaddleClas的PP-ShiTu图像识别系统中。PP-ShiTu中使用的主体检测模型基于PP-PicoDet。更多关于PP-ShiTu的介绍与使用可以参考：[PP-ShiTu](https://github.com/PaddlePaddle/PaddleClas)。
+
+
+### 1.1 数据集
+
+PP-ShiTu图像识别任务中，训练主体检测模型时主要用到了以下几个数据集。
+
+| 数据集       | 数据量   | 主体检测任务中使用的数据量   | 场景  | 数据集地址 |
+| :------------:  | :-------------: | :-------: | :-------: | :--------: |
+| Objects365 | 1700k | 173k | 通用场景 | [地址](https://www.objects365.org/overview.html) |
+| COCO2017 | 118k | 118k  | 通用场景 | [地址](https://cocodataset.org/) |
+| iCartoonFace | 48k | 48k | 动漫人脸检测 | [地址](https://github.com/luxiangju-PersonAI/iCartoonFace) |
+| LogoDet-3k | 155k | 155k | Logo检测 | [地址](https://github.com/Wangjing1551/LogoDet-3K-Dataset) |
+| RPC | 54k | 54k  | 商品检测 | [地址](https://rpc-dataset.github.io/) |
+
+在实际训练的过程中，将所有数据集混合在一起。由于是主体检测，这里将所有标注出的检测框对应的类别都修改为 `前景` 的类别，最终融合的数据集中只包含 1 个类别，即前景，数据集定义配置可以参考[picodet_lcnet_x2_5_640_mainbody.yml](./picodet_lcnet_x2_5_640_mainbody.yml)。
+
+
+### 1.2 模型库
+
+| 模型     | 图像输入尺寸 | mAP<sup>val<br>0.5:0.95</sup> | mAP<sup>val<br>0.5</sup> |  下载地址  | config |
+| :-------- | :--------: | :---------------------: | :----------------: | :----------------: | :---------------: |
+| PicoDet-LCNet_x2_5 |  640*640   |          41.5   |    62.0     | [trained model](https://paddledet.bj.bcebos.com/models/picodet_lcnet_x2_5_640_mainbody.pdparams) &#124; [inference model](https://paddledet.bj.bcebos.com/models/picodet_lcnet_x2_5_640_mainbody_infer.tar) &#124; [log](https://paddledet.bj.bcebos.com/logs/train_picodet_lcnet_x2_5_640_mainbody.log) | [config](./picodet_lcnet_x2_5_640_mainbody.yml) |
--- a/paddle_detection/configs/picodet/legacy_model/application/mainbody_detection/picodet_lcnet_x2_5_640_mainbody.yml
+++ b/paddle_detection/configs/picodet/legacy_model/application/mainbody_detection/picodet_lcnet_x2_5_640_mainbody.yml
@@ -0,0 +1,23 @@
+_BASE_: [  # parent config files listed for the main-body detection config
+  '../../../../datasets/coco_detection.yml',  # dataset definition comes from this base; no dataset keys are set in this file
+  '../../../../runtime.yml',
+  '../../_base_/picodet_esnet.yml',  # NOTE(review): ESNet-named base, while PicoDet.backbone below is LCNet
+  '../../_base_/optimizer_100e.yml',
+  '../../_base_/picodet_640_reader.yml',  # no reader keys are set in this file
+]
+
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/LCNet_x2_5_ssld_pretrained.pdparams  # LCNet x2.5 weights, going by the file name
+weights: output/picodet_lcnet_x2_5_640_mainbody/model_final
+find_unused_parameters: True
+use_ema: true
+cycle_epoch: 20
+snapshot_epoch: 2
+
+PicoDet:  # detector assembled from an LCNet backbone, CSPPAN neck and PicoHead head
+  backbone: LCNet
+  neck: CSPPAN
+  head: PicoHead
+
+LCNet:
+  scale: 2.5  # x2.5 variant, consistent with the pretrained weights file name
+  feature_maps: [3, 4, 5]
--- a/paddle_detection/configs/picodet/legacy_model/application/pedestrian_detection/picodet_s_192_pedestrian.yml
+++ b/paddle_detection/configs/picodet/legacy_model/application/pedestrian_detection/picodet_s_192_pedestrian.yml
@@ -0,0 +1,149 @@
+# Runtime, checkpoint and schedule settings for the 192x192 pedestrian PicoDet config.
+use_gpu: true
+log_iter: 20
+save_dir: output
+print_flops: false
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ESNet_x0_75_pretrained.pdparams
+weights: output/picodet_s_192_pedestrian/model_final
+find_unused_parameters: true
+use_ema: true
+cycle_epoch: 40
+snapshot_epoch: 10  # defined once; 10 is the value that already won under last-key-wins parsing
+epoch: 300
+metric: COCO
+num_classes: 1
+# Model export options
+export:
+  post_process: False  # Whether post-processing is included in the network when exporting the model.
+  nms: False           # Whether NMS is included in the network when exporting the model.
+  benchmark: False     # Used for testing model performance; if set to `True`, post-processing and NMS are not exported.
+
+architecture: PicoDet
+
+PicoDet:  # detector assembled from an ESNet backbone, CSPPAN neck and PicoHead head
+  backbone: ESNet
+  neck: CSPPAN
+  head: PicoHead
+
+ESNet:
+  scale: 0.75  # x0.75 variant, consistent with the ESNet_x0_75 pretrained weights file name
+  feature_maps: [4, 11, 14]
+  act: hard_swish
+  channel_ratio: [0.875, 0.5, 0.5, 0.5, 0.625, 0.5, 0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]  # 13 ratios
+
+CSPPAN:
+  out_channels: 96  # same value as PicoHead's feat_in / feat_in_chan in this file
+  use_depthwise: True
+  num_csp_blocks: 1
+  num_features: 4  # same count as the four fpn_stride entries in PicoHead
+
+PicoHead:
+  conv_feat:
+    name: PicoFeat
+    feat_in: 96  # same value as CSPPAN.out_channels in this file
+    feat_out: 96
+    num_convs: 2
+    num_fpn_stride: 4  # equals the number of fpn_stride entries below
+    norm_type: bn
+    share_cls_reg: True
+  fpn_stride: [8, 16, 32, 64]
+  feat_in_chan: 96
+  prior_prob: 0.01
+  reg_max: 7
+  cell_offset: 0.5
+  loss_class:  # classification loss: VarifocalLoss, weight 1.0
+    name: VarifocalLoss
+    use_sigmoid: True
+    iou_weighted: True
+    loss_weight: 1.0
+  loss_dfl:  # DistributionFocalLoss term, weight 0.25
+    name: DistributionFocalLoss
+    loss_weight: 0.25
+  loss_bbox:  # box loss: GIoULoss, weight 2.0
+    name: GIoULoss
+    loss_weight: 2.0
+  assigner:  # assigner component: SimOTAAssigner
+    name: SimOTAAssigner
+    candidate_topk: 10
+    iou_weight: 6
+  nms:  # MultiClassNMS settings
+    name: MultiClassNMS
+    nms_top_k: 1000
+    keep_top_k: 100
+    score_threshold: 0.025
+    nms_threshold: 0.6
+
+LearningRate:
+  base_lr: 0.4
+  schedulers:  # two scheduler entries: cosine decay plus linear warmup
+  - !CosineDecay
+    max_epochs: 300  # same value as the top-level `epoch`
+  - !LinearWarmup
+    start_factor: 0.1
+    steps: 300  # NOTE(review): presumably warmup iterations rather than epochs — confirm
+
+OptimizerBuilder:
+  optimizer:
+    momentum: 0.9
+    type: Momentum
+  regularizer:
+    factor: 0.00004  # i.e. 4e-5
+    type: L2
+
+TrainDataset:  # NOTE(review): trains on aic_coco_train_cocoformat.json while EvalDataset below uses COCO val2017
+  !COCODataSet
+    image_dir: ""  # left empty; presumably image paths resolve from dataset_dir — confirm
+    anno_path: aic_coco_train_cocoformat.json
+    dataset_dir: dataset
+    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  !COCODataSet
+    image_dir: val2017
+    anno_path: annotations/instances_val2017.json
+    dataset_dir: dataset/coco
+
+TestDataset:
+  !ImageFolder
+    anno_path: annotations/instances_val2017.json  # NOTE(review): no dataset_dir is given here, unlike EvalDataset
+
+worker_num: 8
+TrainReader:  # training pipeline: per-sample augmentation, then batch-level resize/normalize/permute
+  sample_transforms:
+  - Decode: {}
+  - RandomCrop: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomDistort: {}
+  batch_transforms:
+  - BatchRandomResize: {target_size: [128, 160, 192, 224, 256], random_size: True, random_interp: True, keep_ratio: False}  # sizes step by 32 around the 192 eval size
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 128
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+
+EvalReader:  # evaluation pipeline: fixed 192x192 resize, batches of 8
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: [192, 192], keep_ratio: False}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 8
+  shuffle: false
+
+TestReader:  # single-image pipeline; declared input shape is [1, 3, 192, 192]
+  inputs_def:
+    image_shape: [1, 3, 192, 192]
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: [192, 192], keep_ratio: False}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 1
+  shuffle: false
+  fuse_normalize: true  # NOTE(review): effect not visible in this file — confirm against the reader implementation
--- a/paddle_detection/configs/picodet/legacy_model/application/pedestrian_detection/picodet_s_320_pedestrian.yml
+++ b/paddle_detection/configs/picodet/legacy_model/application/pedestrian_detection/picodet_s_320_pedestrian.yml
@@ -0,0 +1,148 @@
+# Runtime, checkpoint and schedule settings for the 320x320 pedestrian PicoDet config.
+use_gpu: true
+log_iter: 20
+save_dir: output
+print_flops: false
+pretrain_weights: https://paddledet.bj.bcebos.com/models/pretrained/ESNet_x0_75_pretrained.pdparams
+weights: output/picodet_s_320_pedestrian/model_final
+find_unused_parameters: true
+use_ema: true
+cycle_epoch: 40
+snapshot_epoch: 10  # defined once; 10 is the value that already won under last-key-wins parsing
+epoch: 300
+metric: COCO
+num_classes: 1
+# Model export options
+export:
+  post_process: False  # Whether post-processing is included in the network when exporting the model.
+  nms: False           # Whether NMS is included in the network when exporting the model.
+  benchmark: False     # Used for testing model performance; if set to `True`, post-processing and NMS are not exported.
+
+architecture: PicoDet
+
+PicoDet:  # detector assembled from an ESNet backbone, CSPPAN neck and PicoHead head
+  backbone: ESNet
+  neck: CSPPAN
+  head: PicoHead
+
+ESNet:
+  scale: 0.75  # x0.75 variant, consistent with the ESNet_x0_75 pretrained weights file name
+  feature_maps: [4, 11, 14]
+  act: hard_swish
+  channel_ratio: [0.875, 0.5, 0.5, 0.5, 0.625, 0.5, 0.625, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]  # 13 ratios
+
+CSPPAN:
+  out_channels: 96  # same value as PicoHead's feat_in / feat_in_chan in this file
+  use_depthwise: True
+  num_csp_blocks: 1
+  num_features: 4  # same count as the four fpn_stride entries in PicoHead
+
+PicoHead:
+  conv_feat:
+    name: PicoFeat
+    feat_in: 96  # same value as CSPPAN.out_channels in this file
+    feat_out: 96
+    num_convs: 2
+    num_fpn_stride: 4  # equals the number of fpn_stride entries below
+    norm_type: bn
+    share_cls_reg: True
+  fpn_stride: [8, 16, 32, 64]
+  feat_in_chan: 96
+  prior_prob: 0.01
+  reg_max: 7
+  cell_offset: 0.5
+  loss_class:  # classification loss: VarifocalLoss, weight 1.0
+    name: VarifocalLoss
+    use_sigmoid: True
+    iou_weighted: True
+    loss_weight: 1.0
+  loss_dfl:  # DistributionFocalLoss term, weight 0.25
+    name: DistributionFocalLoss
+    loss_weight: 0.25
+  loss_bbox:  # box loss: GIoULoss, weight 2.0
+    name: GIoULoss
+    loss_weight: 2.0
+  assigner:  # assigner component: SimOTAAssigner
+    name: SimOTAAssigner
+    candidate_topk: 10
+    iou_weight: 6
+  nms:  # MultiClassNMS settings
+    name: MultiClassNMS
+    nms_top_k: 1000
+    keep_top_k: 100
+    score_threshold: 0.025
+    nms_threshold: 0.6
+
+LearningRate:
+  base_lr: 0.4
+  schedulers:  # two scheduler entries: cosine decay plus linear warmup
+  - !CosineDecay
+    max_epochs: 300  # same value as the top-level `epoch`
+  - !LinearWarmup
+    start_factor: 0.1
+    steps: 300  # NOTE(review): presumably warmup iterations rather than epochs — confirm
+
+OptimizerBuilder:
+  optimizer:
+    momentum: 0.9
+    type: Momentum
+  regularizer:
+    factor: 0.00004  # i.e. 4e-5
+    type: L2
+
+TrainDataset:  # NOTE(review): trains on aic_coco_train_cocoformat.json while EvalDataset below uses COCO val2017
+  !COCODataSet
+    image_dir: ""  # left empty; presumably image paths resolve from dataset_dir — confirm
+    anno_path: aic_coco_train_cocoformat.json
+    dataset_dir: dataset
+    data_fields: ['image', 'gt_bbox', 'gt_class', 'is_crowd']
+
+EvalDataset:
+  !COCODataSet
+    image_dir: val2017
+    anno_path: annotations/instances_val2017.json
+    dataset_dir: dataset/coco
+
+TestDataset:
+  !ImageFolder
+    anno_path: annotations/instances_val2017.json  # NOTE(review): no dataset_dir is given here, unlike EvalDataset
+
+worker_num: 8
+TrainReader:  # training pipeline: per-sample augmentation, then batch-level resize/normalize/permute
+  sample_transforms:
+  - Decode: {}
+  - RandomCrop: {}
+  - RandomFlip: {prob: 0.5}
+  - RandomDistort: {}
+  batch_transforms:
+  - BatchRandomResize: {target_size: [256, 288, 320, 352, 384], random_size: True, random_interp: True, keep_ratio: False}  # sizes step by 32 around the 320 eval size
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_size: 128
+  shuffle: true
+  drop_last: true
+  collate_batch: false
+
+EvalReader:  # evaluation pipeline: fixed 320x320 resize, batches of 8
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: [320, 320], keep_ratio: False}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 8
+  shuffle: false
+
+TestReader:  # single-image pipeline; declared input shape is [1, 3, 320, 320]
+  inputs_def:
+    image_shape: [1, 3, 320, 320]
+  sample_transforms:
+  - Decode: {}
+  - Resize: {interp: 2, target_size: [320, 320], keep_ratio: False}
+  - NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
+  - Permute: {}
+  batch_transforms:
+  - PadBatch: {pad_to_stride: 32}
+  batch_size: 1
+  shuffle: false