更换文档检测模型
This commit is contained in:
69
paddle_detection/configs/vitdet/README.md
Normal file
69
paddle_detection/configs/vitdet/README.md
Normal file
@@ -0,0 +1,69 @@
|
||||
# Vision Transformer Detection
|
||||
|
||||
## Introduction
|
||||
|
||||
- [Context Autoencoder for Self-Supervised Representation Learning](https://arxiv.org/abs/2202.03026)
|
||||
- [Benchmarking Detection Transfer Learning with Vision Transformers](https://arxiv.org/pdf/2111.11429.pdf)
|
||||
|
||||
Object detection is a central downstream task used to
|
||||
test if pre-trained network parameters confer benefits, such
|
||||
as improved accuracy or training speed. The complexity
|
||||
of object detection methods can make this benchmarking
|
||||
non-trivial when new architectures, such as Vision Transformer (ViT) models, arrive.
|
||||
|
||||
## Model Zoo
|
||||
|
||||
| Model | Backbone | Pretrained | Scheduler | Images/GPU | Box AP | Mask AP | Config | Download |
|
||||
|:------:|:--------:|:--------------:|:--------------:|:--------------:|:--------------:|:------:|:------:|:--------:|
|
||||
| Cascade RCNN | ViT-base | CAE | 1x | 1 | 52.7 | - | [config](./cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_base_hrfpn_cae_1x_coco.pdparams) |
|
||||
| Cascade RCNN | ViT-large | CAE | 1x | 1 | 55.7 | - | [config](./cascade_rcnn_vit_large_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/cascade_rcnn_vit_large_hrfpn_cae_1x_coco.pdparams) |
|
||||
| PP-YOLOE | ViT-base | CAE | 36e | 2 | 52.2 | - | [config](./ppyoloe_vit_base_csppan_cae_36e_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/ppyoloe_vit_base_csppan_cae_36e_coco.pdparams) |
|
||||
| Mask RCNN | ViT-base | CAE | 1x | 1 | 50.6 | 44.9 | [config](./mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/mask_rcnn_vit_base_hrfpn_cae_1x_coco.pdparams) |
|
||||
| Mask RCNN | ViT-large | CAE | 1x | 1 | 54.2 | 47.4 | [config](./mask_rcnn_vit_large_hrfpn_cae_1x_coco.yml) | [model](https://bj.bcebos.com/v1/paddledet/models/mask_rcnn_vit_large_hrfpn_cae_1x_coco.pdparams) |
|
||||
|
||||
|
||||
**Notes:**
|
||||
- Models are trained on the COCO train2017 dataset and evaluated on val2017; results are reported as `mAP(IoU=0.5:0.95)`
|
||||
- Base models are trained on 8x 32G V100 GPUs; large models are trained on 8x 80G A100 GPUs
|
||||
- The `Cascade RCNN` experiments are based on PaddlePaddle 2.2.2
|
||||
|
||||
## Citations
|
||||
```
|
||||
@article{chen2022context,
|
||||
title={Context autoencoder for self-supervised representation learning},
|
||||
author={Chen, Xiaokang and Ding, Mingyu and Wang, Xiaodi and Xin, Ying and Mo, Shentong and Wang, Yunhao and Han, Shumin and Luo, Ping and Zeng, Gang and Wang, Jingdong},
|
||||
journal={arXiv preprint arXiv:2202.03026},
|
||||
year={2022}
|
||||
}
|
||||
|
||||
@article{DBLP:journals/corr/abs-2111-11429,
|
||||
author = {Yanghao Li and
|
||||
Saining Xie and
|
||||
Xinlei Chen and
|
||||
Piotr Doll{\'{a}}r and
|
||||
Kaiming He and
|
||||
Ross B. Girshick},
|
||||
title = {Benchmarking Detection Transfer Learning with Vision Transformers},
|
||||
journal = {CoRR},
|
||||
volume = {abs/2111.11429},
|
||||
year = {2021},
|
||||
url = {https://arxiv.org/abs/2111.11429},
|
||||
eprinttype = {arXiv},
|
||||
eprint = {2111.11429},
|
||||
timestamp = {Fri, 26 Nov 2021 13:48:43 +0100},
|
||||
biburl = {https://dblp.org/rec/journals/corr/abs-2111-11429.bib},
|
||||
bibsource = {dblp computer science bibliography, https://dblp.org}
|
||||
}
|
||||
|
||||
@article{Cai_2019,
|
||||
title={Cascade R-CNN: High Quality Object Detection and Instance Segmentation},
|
||||
ISSN={1939-3539},
|
||||
url={http://dx.doi.org/10.1109/tpami.2019.2956516},
|
||||
DOI={10.1109/tpami.2019.2956516},
|
||||
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
|
||||
publisher={Institute of Electrical and Electronics Engineers (IEEE)},
|
||||
author={Cai, Zhaowei and Vasconcelos, Nuno},
|
||||
year={2019},
|
||||
pages={1–1}
|
||||
}
|
||||
```
|
||||
@@ -0,0 +1,41 @@
|
||||
worker_num: 2
|
||||
TrainReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- RandomResizeCrop: {resizes: [400, 500, 600], cropsizes: [[384, 600], ], prob: 0.5}
|
||||
- RandomResize: {target_size: [[480, 1333], [512, 1333], [544, 1333], [576, 1333], [608, 1333], [640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], keep_ratio: True, interp: 2}
|
||||
- RandomFlip: {prob: 0.5}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_transforms:
|
||||
- PadBatch: {pad_to_stride: 32}
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
drop_last: true
|
||||
collate_batch: false
|
||||
|
||||
EvalReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_transforms:
|
||||
- PadBatch: {pad_to_stride: 32}
|
||||
batch_size: 1
|
||||
shuffle: false
|
||||
drop_last: false
|
||||
|
||||
|
||||
TestReader:
|
||||
inputs_def:
|
||||
image_shape: [-1, 3, 640, 640]
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {interp: 2, target_size: 640, keep_ratio: True}
|
||||
- Pad: {size: 640}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_size: 1
|
||||
shuffle: false
|
||||
drop_last: false
|
||||
41
paddle_detection/configs/vitdet/_base_/mask_rcnn_reader.yml
Normal file
41
paddle_detection/configs/vitdet/_base_/mask_rcnn_reader.yml
Normal file
@@ -0,0 +1,41 @@
|
||||
worker_num: 2
|
||||
TrainReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
# - RandomResizeCrop: {resizes: [400, 500, 600], cropsizes: [[384, 600], ], prob: 0.5}
|
||||
- RandomResize: {target_size: [[640, 1333], [672, 1333], [704, 1333], [736, 1333], [768, 1333], [800, 1333]], interp: 2, keep_ratio: True}
|
||||
- RandomFlip: {prob: 0.5}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_transforms:
|
||||
- PadBatch: {pad_to_stride: 32}
|
||||
batch_size: 1
|
||||
shuffle: true
|
||||
drop_last: true
|
||||
collate_batch: false
|
||||
use_shared_memory: true
|
||||
|
||||
EvalReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_transforms:
|
||||
- PadBatch: {pad_to_stride: 32}
|
||||
batch_size: 1
|
||||
shuffle: false
|
||||
drop_last: false
|
||||
|
||||
|
||||
TestReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {interp: 2, target_size: [800, 1333], keep_ratio: True}
|
||||
- NormalizeImage: {is_scale: true, mean: [0.485,0.456,0.406], std: [0.229, 0.224,0.225]}
|
||||
- Permute: {}
|
||||
batch_transforms:
|
||||
- PadBatch: {pad_to_stride: 32}
|
||||
batch_size: 1
|
||||
shuffle: false
|
||||
drop_last: false
|
||||
22
paddle_detection/configs/vitdet/_base_/optimizer_base_1x.yml
Normal file
22
paddle_detection/configs/vitdet/_base_/optimizer_base_1x.yml
Normal file
@@ -0,0 +1,22 @@
|
||||
epoch: 12
|
||||
|
||||
LearningRate:
|
||||
base_lr: 0.0001
|
||||
schedulers:
|
||||
- !PiecewiseDecay
|
||||
gamma: 0.1
|
||||
milestones: [9, 11]
|
||||
- !LinearWarmup
|
||||
start_factor: 0.001
|
||||
steps: 1000
|
||||
|
||||
OptimizerBuilder:
|
||||
optimizer:
|
||||
type: AdamWDL
|
||||
betas: [0.9, 0.999]
|
||||
layer_decay: 0.75
|
||||
weight_decay: 0.02
|
||||
num_layers: 12
|
||||
filter_bias_and_bn: True
|
||||
skip_decay_names: ['pos_embed', 'cls_token']
|
||||
set_param_lr_func: 'layerwise_lr_decay'
|
||||
@@ -0,0 +1,20 @@
|
||||
|
||||
epoch: 36
|
||||
|
||||
LearningRate:
|
||||
base_lr: 0.0001
|
||||
schedulers:
|
||||
- !CosineDecay
|
||||
max_epochs: 36
|
||||
min_lr_ratio: 0.1 # 0.1
|
||||
- !LinearWarmup
|
||||
start_factor: 0.001
|
||||
epochs: 1
|
||||
|
||||
|
||||
OptimizerBuilder:
|
||||
clip_grad_by_norm: 0.1
|
||||
regularizer: false
|
||||
optimizer:
|
||||
type: AdamW
|
||||
weight_decay: 0.0001
|
||||
40
paddle_detection/configs/vitdet/_base_/ppyoloe_reader.yml
Normal file
40
paddle_detection/configs/vitdet/_base_/ppyoloe_reader.yml
Normal file
@@ -0,0 +1,40 @@
|
||||
worker_num: 4
|
||||
eval_height: &eval_height 640
|
||||
eval_width: &eval_width 640
|
||||
eval_size: &eval_size [*eval_height, *eval_width]
|
||||
|
||||
TrainReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- RandomDistort: {}
|
||||
- RandomExpand: {fill_value: [123.675, 116.28, 103.53]}
|
||||
- RandomCrop: {}
|
||||
- RandomFlip: {}
|
||||
batch_transforms:
|
||||
- BatchRandomResize: {target_size: [320, 352, 384, 416, 448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768], random_size: True, random_interp: True, keep_ratio: False}
|
||||
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
|
||||
- Permute: {}
|
||||
- PadGT: {}
|
||||
batch_size: 2
|
||||
shuffle: true
|
||||
drop_last: true
|
||||
use_shared_memory: true
|
||||
collate_batch: true
|
||||
|
||||
EvalReader:
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
|
||||
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
|
||||
- Permute: {}
|
||||
batch_size: 2
|
||||
|
||||
TestReader:
|
||||
inputs_def:
|
||||
image_shape: [3, *eval_height, *eval_width]
|
||||
sample_transforms:
|
||||
- Decode: {}
|
||||
- Resize: {target_size: *eval_size, keep_ratio: False, interp: 2}
|
||||
- NormalizeImage: {mean: [0., 0., 0.], std: [1., 1., 1.], norm_type: none}
|
||||
- Permute: {}
|
||||
batch_size: 1
|
||||
@@ -0,0 +1,131 @@
|
||||
|
||||
_BASE_: [
|
||||
'../datasets/coco_detection.yml',
|
||||
'../runtime.yml',
|
||||
'./_base_/faster_rcnn_reader.yml',
|
||||
'./_base_/optimizer_base_1x.yml'
|
||||
]
|
||||
|
||||
weights: output/cascade_rcnn_vit_base_hrfpn_cae_1x_coco/model_final
|
||||
|
||||
|
||||
# runtime
|
||||
log_iter: 100
|
||||
snapshot_epoch: 1
|
||||
find_unused_parameters: True
|
||||
|
||||
use_gpu: true
|
||||
norm_type: sync_bn
|
||||
|
||||
|
||||
# reader
|
||||
worker_num: 2
|
||||
TrainReader:
|
||||
batch_size: 1
|
||||
|
||||
|
||||
# model
|
||||
architecture: CascadeRCNN
|
||||
|
||||
CascadeRCNN:
|
||||
backbone: VisionTransformer
|
||||
neck: HRFPN
|
||||
rpn_head: RPNHead
|
||||
bbox_head: CascadeHead
|
||||
# post process
|
||||
bbox_post_process: BBoxPostProcess
|
||||
|
||||
|
||||
VisionTransformer:
|
||||
patch_size: 16
|
||||
embed_dim: 768
|
||||
depth: 12
|
||||
num_heads: 12
|
||||
mlp_ratio: 4
|
||||
qkv_bias: True
|
||||
drop_rate: 0.0
|
||||
drop_path_rate: 0.2
|
||||
init_values: 0.1
|
||||
final_norm: False
|
||||
use_rel_pos_bias: False
|
||||
use_sincos_pos_emb: True
|
||||
epsilon: 0.000001 # 1e-6
|
||||
out_indices: [3, 5, 7, 11]
|
||||
with_fpn: True
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
|
||||
|
||||
HRFPN:
|
||||
out_channel: 256
|
||||
use_bias: True
|
||||
|
||||
RPNHead:
|
||||
anchor_generator:
|
||||
aspect_ratios: [0.5, 1.0, 2.0]
|
||||
anchor_sizes: [[32], [64], [128], [256], [512]]
|
||||
strides: [4, 8, 16, 32, 64]
|
||||
rpn_target_assign:
|
||||
batch_size_per_im: 256
|
||||
fg_fraction: 0.5
|
||||
negative_overlap: 0.3
|
||||
positive_overlap: 0.7
|
||||
use_random: True
|
||||
train_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 2000
|
||||
post_nms_top_n: 2000
|
||||
topk_after_collect: True
|
||||
test_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 1000
|
||||
post_nms_top_n: 1000
|
||||
loss_rpn_bbox: SmoothL1Loss
|
||||
|
||||
SmoothL1Loss:
|
||||
beta: 0.1111111111111111
|
||||
|
||||
|
||||
CascadeHead:
|
||||
head: CascadeXConvNormHead
|
||||
roi_extractor:
|
||||
resolution: 7
|
||||
sampling_ratio: 0
|
||||
aligned: True
|
||||
bbox_assigner: BBoxAssigner
|
||||
bbox_loss: GIoULoss
|
||||
num_cascade_stages: 3
|
||||
reg_class_agnostic: False
|
||||
stage_loss_weights: [1, 0.5, 0.25]
|
||||
loss_normalize_pos: True
|
||||
add_gt_as_proposals: [True, True, True]
|
||||
|
||||
|
||||
BBoxAssigner:
|
||||
batch_size_per_im: 512
|
||||
bg_thresh: 0.5
|
||||
fg_thresh: 0.5
|
||||
fg_fraction: 0.25
|
||||
cascade_iou: [0.5, 0.6, 0.7]
|
||||
use_random: True
|
||||
|
||||
|
||||
CascadeXConvNormHead:
|
||||
norm_type: bn
|
||||
|
||||
|
||||
GIoULoss:
|
||||
loss_weight: 10.
|
||||
reduction: 'none'
|
||||
eps: 0.000001
|
||||
|
||||
|
||||
BBoxPostProcess:
|
||||
decode:
|
||||
name: RCNNBox
|
||||
prior_box_var: [30.0, 30.0, 15.0, 15.0]
|
||||
nms:
|
||||
name: MultiClassNMS
|
||||
keep_top_k: 100
|
||||
score_threshold: 0.05
|
||||
nms_threshold: 0.5
|
||||
@@ -0,0 +1,29 @@
|
||||
_BASE_: [
|
||||
'./cascade_rcnn_vit_base_hrfpn_cae_1x_coco.yml'
|
||||
]
|
||||
|
||||
weights: output/cascade_rcnn_vit_large_hrfpn_cae_1x_coco/model_final
|
||||
|
||||
|
||||
depth: &depth 24
|
||||
dim: &dim 1024
|
||||
use_fused_allreduce_gradients: &use_checkpoint True
|
||||
|
||||
VisionTransformer:
|
||||
img_size: [800, 1344]
|
||||
embed_dim: *dim
|
||||
depth: *depth
|
||||
num_heads: 16
|
||||
drop_path_rate: 0.25
|
||||
out_indices: [7, 11, 15, 23]
|
||||
use_checkpoint: *use_checkpoint
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_large_cae_pretrained.pdparams
|
||||
|
||||
HRFPN:
|
||||
in_channels: [*dim, *dim, *dim, *dim]
|
||||
|
||||
OptimizerBuilder:
|
||||
optimizer:
|
||||
layer_decay: 0.9
|
||||
weight_decay: 0.02
|
||||
num_layers: *depth
|
||||
@@ -0,0 +1,130 @@
|
||||
|
||||
_BASE_: [
|
||||
'../datasets/coco_detection.yml',
|
||||
'../runtime.yml',
|
||||
'./_base_/faster_rcnn_reader.yml',
|
||||
'./_base_/optimizer_base_1x.yml'
|
||||
]
|
||||
|
||||
weights: output/faster_rcnn_vit_base_fpn_cae_1x_coco/model_final
|
||||
|
||||
|
||||
# runtime
|
||||
log_iter: 100
|
||||
snapshot_epoch: 1
|
||||
find_unused_parameters: True
|
||||
|
||||
use_gpu: true
|
||||
norm_type: sync_bn
|
||||
|
||||
OptimizerBuilder:
|
||||
optimizer:
|
||||
weight_decay: 0.05
|
||||
|
||||
# reader
|
||||
worker_num: 2
|
||||
TrainReader:
|
||||
batch_size: 1
|
||||
|
||||
|
||||
# model
|
||||
architecture: FasterRCNN
|
||||
|
||||
FasterRCNN:
|
||||
backbone: VisionTransformer
|
||||
neck: FPN
|
||||
rpn_head: RPNHead
|
||||
bbox_head: BBoxHead
|
||||
bbox_post_process: BBoxPostProcess
|
||||
|
||||
VisionTransformer:
|
||||
patch_size: 16
|
||||
embed_dim: 768
|
||||
depth: 12
|
||||
num_heads: 12
|
||||
mlp_ratio: 4
|
||||
qkv_bias: True
|
||||
drop_rate: 0.0
|
||||
drop_path_rate: 0.2
|
||||
init_values: 0.1
|
||||
final_norm: False
|
||||
use_rel_pos_bias: False
|
||||
use_sincos_pos_emb: True
|
||||
epsilon: 0.000001 # 1e-6
|
||||
out_indices: [3, 5, 7, 11]
|
||||
with_fpn: True
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
|
||||
|
||||
|
||||
FPN:
|
||||
out_channel: 256
|
||||
|
||||
RPNHead:
|
||||
anchor_generator:
|
||||
aspect_ratios: [0.5, 1.0, 2.0]
|
||||
anchor_sizes: [[32], [64], [128], [256], [512]]
|
||||
strides: [4, 8, 16, 32, 64]
|
||||
rpn_target_assign:
|
||||
batch_size_per_im: 256
|
||||
fg_fraction: 0.5
|
||||
negative_overlap: 0.3
|
||||
positive_overlap: 0.7
|
||||
use_random: True
|
||||
train_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 2000
|
||||
post_nms_top_n: 1000
|
||||
topk_after_collect: True
|
||||
test_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 1000
|
||||
post_nms_top_n: 1000
|
||||
loss_rpn_bbox: SmoothL1Loss
|
||||
|
||||
|
||||
SmoothL1Loss:
|
||||
beta: 0.1111111111111111
|
||||
|
||||
|
||||
BBoxHead:
|
||||
# head: TwoFCHead
|
||||
head: XConvNormHead
|
||||
roi_extractor:
|
||||
resolution: 7
|
||||
sampling_ratio: 0
|
||||
aligned: True
|
||||
bbox_assigner: BBoxAssigner
|
||||
loss_normalize_pos: True
|
||||
bbox_loss: GIoULoss
|
||||
|
||||
|
||||
GIoULoss:
|
||||
loss_weight: 10.
|
||||
reduction: 'none'
|
||||
eps: 0.000001 # 1e-6
|
||||
|
||||
|
||||
BBoxAssigner:
|
||||
batch_size_per_im: 512
|
||||
bg_thresh: 0.5
|
||||
fg_thresh: 0.5
|
||||
fg_fraction: 0.25
|
||||
use_random: True
|
||||
|
||||
# TwoFCHead:
|
||||
# out_channel: 1024
|
||||
|
||||
XConvNormHead:
|
||||
num_convs: 4
|
||||
norm_type: bn
|
||||
|
||||
|
||||
BBoxPostProcess:
|
||||
decode: RCNNBox
|
||||
nms:
|
||||
name: MultiClassNMS
|
||||
keep_top_k: 100
|
||||
score_threshold: 0.05
|
||||
nms_threshold: 0.5
|
||||
@@ -0,0 +1,135 @@
|
||||
_BASE_: [
|
||||
'../datasets/coco_instance.yml',
|
||||
'../runtime.yml',
|
||||
'./_base_/mask_rcnn_reader.yml',
|
||||
'./_base_/optimizer_base_1x.yml'
|
||||
]
|
||||
|
||||
weights: output/mask_rcnn_vit_base_hrfpn_cae_1x_coco/model_final
|
||||
|
||||
|
||||
# runtime
|
||||
log_iter: 100
|
||||
snapshot_epoch: 1
|
||||
norm_type: sync_bn
|
||||
use_fused_allreduce_gradients: &use_checkpoint False
|
||||
|
||||
|
||||
architecture: MaskRCNN
|
||||
MaskRCNN:
|
||||
backbone: VisionTransformer
|
||||
neck: HRFPN
|
||||
rpn_head: RPNHead
|
||||
bbox_head: BBoxHead
|
||||
mask_head: MaskHead
|
||||
# post process
|
||||
bbox_post_process: BBoxPostProcess
|
||||
mask_post_process: MaskPostProcess
|
||||
|
||||
VisionTransformer:
|
||||
patch_size: 16
|
||||
embed_dim: 768
|
||||
depth: 12
|
||||
num_heads: 12
|
||||
mlp_ratio: 4
|
||||
qkv_bias: True
|
||||
drop_rate: 0.0
|
||||
drop_path_rate: 0.2
|
||||
init_values: 0.1
|
||||
final_norm: False
|
||||
use_rel_pos_bias: False
|
||||
use_sincos_pos_emb: True
|
||||
epsilon: 0.000001 # 1e-6
|
||||
out_indices: [3, 5, 7, 11]
|
||||
with_fpn: True
|
||||
use_checkpoint: *use_checkpoint
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
|
||||
|
||||
HRFPN:
|
||||
out_channel: 256
|
||||
use_bias: True
|
||||
|
||||
RPNHead:
|
||||
anchor_generator:
|
||||
aspect_ratios: [0.5, 1.0, 2.0]
|
||||
anchor_sizes: [[32], [64], [128], [256], [512]]
|
||||
strides: [4, 8, 16, 32, 64]
|
||||
rpn_target_assign:
|
||||
batch_size_per_im: 256
|
||||
fg_fraction: 0.5
|
||||
negative_overlap: 0.3
|
||||
positive_overlap: 0.7
|
||||
use_random: True
|
||||
train_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 2000
|
||||
post_nms_top_n: 1000
|
||||
topk_after_collect: True
|
||||
test_proposal:
|
||||
min_size: 0.0
|
||||
nms_thresh: 0.7
|
||||
pre_nms_top_n: 1000
|
||||
post_nms_top_n: 1000
|
||||
loss_rpn_bbox: SmoothL1Loss
|
||||
|
||||
SmoothL1Loss:
|
||||
beta: 0.1111111111111111
|
||||
|
||||
|
||||
BBoxHead:
|
||||
head: XConvNormHead
|
||||
roi_extractor:
|
||||
resolution: 7
|
||||
sampling_ratio: 0
|
||||
aligned: True
|
||||
bbox_assigner: BBoxAssigner
|
||||
loss_normalize_pos: True
|
||||
bbox_loss: GIoULoss
|
||||
|
||||
BBoxAssigner:
|
||||
batch_size_per_im: 512
|
||||
bg_thresh: 0.5
|
||||
fg_thresh: 0.5
|
||||
fg_fraction: 0.25
|
||||
use_random: True
|
||||
|
||||
|
||||
XConvNormHead:
|
||||
num_convs: 4
|
||||
norm_type: bn
|
||||
|
||||
GIoULoss:
|
||||
loss_weight: 10.
|
||||
reduction: 'none'
|
||||
eps: 0.000001
|
||||
|
||||
|
||||
|
||||
BBoxPostProcess:
|
||||
decode: RCNNBox
|
||||
nms:
|
||||
name: MultiClassNMS
|
||||
keep_top_k: 100
|
||||
score_threshold: 0.05
|
||||
nms_threshold: 0.5
|
||||
|
||||
MaskHead:
|
||||
head: MaskFeat
|
||||
roi_extractor:
|
||||
resolution: 14
|
||||
sampling_ratio: 0
|
||||
aligned: True
|
||||
mask_assigner: MaskAssigner
|
||||
share_bbox_feat: False
|
||||
|
||||
MaskFeat:
|
||||
num_convs: 4
|
||||
out_channel: 256
|
||||
norm_type: ~
|
||||
|
||||
MaskAssigner:
|
||||
mask_resolution: 28
|
||||
|
||||
MaskPostProcess:
|
||||
binary_thresh: 0.5
|
||||
@@ -0,0 +1,29 @@
|
||||
_BASE_: [
|
||||
'./mask_rcnn_vit_base_hrfpn_cae_1x_coco.yml'
|
||||
]
|
||||
|
||||
weights: output/mask_rcnn_vit_large_hrfpn_cae_1x_coco/model_final
|
||||
|
||||
|
||||
depth: &depth 24
|
||||
dim: &dim 1024
|
||||
use_fused_allreduce_gradients: &use_checkpoint True
|
||||
|
||||
VisionTransformer:
|
||||
img_size: [800, 1344]
|
||||
embed_dim: *dim
|
||||
depth: *depth
|
||||
num_heads: 16
|
||||
drop_path_rate: 0.25
|
||||
out_indices: [7, 11, 15, 23]
|
||||
use_checkpoint: *use_checkpoint
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_large_cae_pretrained.pdparams
|
||||
|
||||
HRFPN:
|
||||
in_channels: [*dim, *dim, *dim, *dim]
|
||||
|
||||
OptimizerBuilder:
|
||||
optimizer:
|
||||
layer_decay: 0.9
|
||||
weight_decay: 0.02
|
||||
num_layers: *depth
|
||||
@@ -0,0 +1,78 @@
|
||||
|
||||
_BASE_: [
|
||||
'../datasets/coco_detection.yml',
|
||||
'../runtime.yml',
|
||||
'./_base_/ppyoloe_reader.yml',
|
||||
'./_base_/optimizer_base_36e.yml'
|
||||
]
|
||||
|
||||
weights: output/ppyoloe_vit_base_csppan_cae_36e_coco/model_final
|
||||
|
||||
|
||||
snapshot_epoch: 2
|
||||
log_iter: 100
|
||||
|
||||
|
||||
use_ema: true
|
||||
ema_decay: 0.9999
|
||||
ema_skip_names: ['yolo_head.proj_conv.weight', 'backbone.pos_embed']
|
||||
custom_black_list: ['reduce_mean']
|
||||
use_fused_allreduce_gradients: &use_checkpoint False
|
||||
|
||||
|
||||
architecture: YOLOv3
|
||||
norm_type: sync_bn
|
||||
|
||||
YOLOv3:
|
||||
backbone: VisionTransformer
|
||||
neck: YOLOCSPPAN
|
||||
yolo_head: PPYOLOEHead
|
||||
post_process: ~
|
||||
|
||||
VisionTransformer:
|
||||
patch_size: 16
|
||||
embed_dim: 768
|
||||
depth: 12
|
||||
num_heads: 12
|
||||
mlp_ratio: 4
|
||||
qkv_bias: True
|
||||
drop_rate: 0.0
|
||||
drop_path_rate: 0.2
|
||||
init_values: 0.1
|
||||
final_norm: False
|
||||
use_rel_pos_bias: False
|
||||
use_sincos_pos_emb: True
|
||||
epsilon: 0.000001 # 1e-6
|
||||
out_indices: [11, ]
|
||||
with_fpn: True
|
||||
num_fpn_levels: 3
|
||||
out_with_norm: False
|
||||
use_checkpoint: *use_checkpoint
|
||||
pretrained: https://bj.bcebos.com/v1/paddledet/models/pretrained/vit_base_cae_pretrained.pdparams
|
||||
|
||||
YOLOCSPPAN:
|
||||
in_channels: [768, 768, 768]
|
||||
act: 'silu'
|
||||
|
||||
PPYOLOEHead:
|
||||
fpn_strides: [8, 16, 32]
|
||||
in_channels: [768, 768, 768]
|
||||
static_assigner_epoch: -1
|
||||
grid_cell_scale: 5.0
|
||||
grid_cell_offset: 0.5
|
||||
use_varifocal_loss: True
|
||||
loss_weight: {class: 1.0, iou: 2.5, dfl: 0.5}
|
||||
static_assigner:
|
||||
name: ATSSAssigner
|
||||
topk: 9
|
||||
assigner:
|
||||
name: TaskAlignedAssigner
|
||||
topk: 13
|
||||
alpha: 1.0
|
||||
beta: 6.0
|
||||
nms:
|
||||
name: MultiClassNMS
|
||||
nms_top_k: 1000
|
||||
keep_top_k: 300
|
||||
score_threshold: 0.01
|
||||
nms_threshold: 0.7
|
||||
Reference in New Issue
Block a user