Replace the document detection model

2024-08-27 14:42:45 +08:00
parent aea6f19951
commit 1514e09c40
2072 changed files with 254336 additions and 4967 deletions


@@ -0,0 +1,41 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import fpn
from . import yolo_fpn
from . import hrfpn
from . import ttf_fpn
from . import centernet_fpn
from . import blazeface_fpn
from . import bifpn
from . import csp_pan
from . import es_pan
from . import lc_pan
from . import custom_pan
from . import dilated_encoder
from . import channel_mapper
from . import clrnet_fpn
from .fpn import *
from .yolo_fpn import *
from .hrfpn import *
from .ttf_fpn import *
from .centernet_fpn import *
from .blazeface_fpn import *
from .bifpn import *
from .csp_pan import *
from .es_pan import *
from .lc_pan import *
from .custom_pan import *
from .dilated_encoder import *
from .channel_mapper import *
from .clrnet_fpn import *


@@ -0,0 +1,300 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import ConvNormLayer
from ..shape_spec import ShapeSpec
__all__ = ['BiFPN']
class SeparableConvLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels=None,
kernel_size=3,
norm_type='bn',
norm_groups=32,
act='swish'):
super(SeparableConvLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn', 'gn', None]
assert act in ['swish', 'relu', None]
        self.in_channels = in_channels
        self.out_channels = in_channels if out_channels is None else out_channels
self.norm_type = norm_type
self.norm_groups = norm_groups
self.depthwise_conv = nn.Conv2D(
in_channels,
in_channels,
kernel_size,
padding=kernel_size // 2,
groups=in_channels,
bias_attr=False)
self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1)
# norm type
if self.norm_type in ['bn', 'sync_bn']:
self.norm = nn.BatchNorm2D(self.out_channels)
elif self.norm_type == 'gn':
self.norm = nn.GroupNorm(
num_groups=self.norm_groups, num_channels=self.out_channels)
        # activation; default to None so forward's `self.act is not None`
        # check works when act=None is passed
        self.act = None
        if act == 'swish':
            self.act = nn.Swish()
        elif act == 'relu':
            self.act = nn.ReLU()
def forward(self, x):
if self.act is not None:
x = self.act(x)
out = self.depthwise_conv(x)
out = self.pointwise_conv(out)
if self.norm_type is not None:
out = self.norm(out)
return out
class BiFPNCell(nn.Layer):
def __init__(self,
channels=256,
num_levels=5,
eps=1e-5,
use_weighted_fusion=True,
kernel_size=3,
norm_type='bn',
norm_groups=32,
act='swish'):
super(BiFPNCell, self).__init__()
self.channels = channels
self.num_levels = num_levels
self.eps = eps
self.use_weighted_fusion = use_weighted_fusion
# up
self.conv_up = nn.LayerList([
SeparableConvLayer(
self.channels,
kernel_size=kernel_size,
norm_type=norm_type,
norm_groups=norm_groups,
act=act) for _ in range(self.num_levels - 1)
])
# down
self.conv_down = nn.LayerList([
SeparableConvLayer(
self.channels,
kernel_size=kernel_size,
norm_type=norm_type,
norm_groups=norm_groups,
act=act) for _ in range(self.num_levels - 1)
])
if self.use_weighted_fusion:
self.up_weights = self.create_parameter(
shape=[self.num_levels - 1, 2],
attr=ParamAttr(initializer=Constant(1.)))
self.down_weights = self.create_parameter(
shape=[self.num_levels - 1, 3],
attr=ParamAttr(initializer=Constant(1.)))
def _feature_fusion_cell(self,
conv_layer,
lateral_feat,
sampling_feat,
route_feat=None,
weights=None):
if self.use_weighted_fusion:
weights = F.relu(weights)
weights = weights / (weights.sum() + self.eps)
if route_feat is not None:
out_feat = weights[0] * lateral_feat + \
weights[1] * sampling_feat + \
weights[2] * route_feat
else:
out_feat = weights[0] * lateral_feat + \
weights[1] * sampling_feat
else:
if route_feat is not None:
out_feat = lateral_feat + sampling_feat + route_feat
else:
out_feat = lateral_feat + sampling_feat
out_feat = conv_layer(out_feat)
return out_feat
def forward(self, feats):
# feats: [P3 - P7]
lateral_feats = []
# up
up_feature = feats[-1]
for i, feature in enumerate(feats[::-1]):
if i == 0:
lateral_feats.append(feature)
else:
shape = paddle.shape(feature)
up_feature = F.interpolate(
up_feature, size=[shape[2], shape[3]])
lateral_feature = self._feature_fusion_cell(
self.conv_up[i - 1],
feature,
up_feature,
weights=self.up_weights[i - 1]
if self.use_weighted_fusion else None)
lateral_feats.append(lateral_feature)
up_feature = lateral_feature
out_feats = []
# down
down_feature = lateral_feats[-1]
for i, (lateral_feature,
route_feature) in enumerate(zip(lateral_feats[::-1], feats)):
if i == 0:
out_feats.append(lateral_feature)
else:
down_feature = F.max_pool2d(down_feature, 3, 2, 1)
if i == len(feats) - 1:
route_feature = None
weights = self.down_weights[
i - 1][:2] if self.use_weighted_fusion else None
else:
weights = self.down_weights[
i - 1] if self.use_weighted_fusion else None
out_feature = self._feature_fusion_cell(
self.conv_down[i - 1],
lateral_feature,
down_feature,
route_feature,
weights=weights)
out_feats.append(out_feature)
down_feature = out_feature
return out_feats
@register
@serializable
class BiFPN(nn.Layer):
"""
Bidirectional Feature Pyramid Network, see https://arxiv.org/abs/1911.09070
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config.
out_channel (int): output channel of each level.
num_extra_levels (int): the number of extra stages added to the last level.
default: 2
fpn_strides (List): The stride of each level.
num_stacks (int): the number of stacks for BiFPN, default: 1.
use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True.
        norm_type (string|None): the normalization type in the BiFPN module. If
            norm_type is None, no norm is applied after conv; if it is a
            string, bn, gn and sync_bn are available. default: bn.
        norm_groups (int): the number of groups when norm_type is gn.
        act (string|None): the activation function of BiFPN.
"""
def __init__(self,
in_channels=(512, 1024, 2048),
out_channel=256,
num_extra_levels=2,
fpn_strides=[8, 16, 32, 64, 128],
num_stacks=1,
use_weighted_fusion=True,
norm_type='bn',
norm_groups=32,
act='swish'):
super(BiFPN, self).__init__()
assert num_stacks > 0, "The number of stacks of BiFPN is at least 1."
assert norm_type in ['bn', 'sync_bn', 'gn', None]
assert act in ['swish', 'relu', None]
        assert num_extra_levels >= 0, \
            "The `num_extra_levels` must be non-negative (>= 0)."
self.in_channels = in_channels
self.out_channel = out_channel
self.num_extra_levels = num_extra_levels
self.num_stacks = num_stacks
self.use_weighted_fusion = use_weighted_fusion
self.norm_type = norm_type
self.norm_groups = norm_groups
self.act = act
self.num_levels = len(self.in_channels) + self.num_extra_levels
if len(fpn_strides) != self.num_levels:
for i in range(self.num_extra_levels):
fpn_strides += [fpn_strides[-1] * 2]
self.fpn_strides = fpn_strides
self.lateral_convs = nn.LayerList()
for in_c in in_channels:
self.lateral_convs.append(
ConvNormLayer(in_c, self.out_channel, 1, 1))
if self.num_extra_levels > 0:
self.extra_convs = nn.LayerList()
for i in range(self.num_extra_levels):
if i == 0:
self.extra_convs.append(
ConvNormLayer(self.in_channels[-1], self.out_channel, 3,
2))
else:
self.extra_convs.append(nn.MaxPool2D(3, 2, 1))
self.bifpn_cells = nn.LayerList()
for i in range(self.num_stacks):
self.bifpn_cells.append(
BiFPNCell(
self.out_channel,
self.num_levels,
use_weighted_fusion=self.use_weighted_fusion,
norm_type=self.norm_type,
norm_groups=self.norm_groups,
act=self.act))
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'fpn_strides': [i.stride for i in input_shape]
}
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channel, stride=s) for s in self.fpn_strides
]
def forward(self, feats):
assert len(feats) == len(self.in_channels)
fpn_feats = []
for conv_layer, feature in zip(self.lateral_convs, feats):
fpn_feats.append(conv_layer(feature))
if self.num_extra_levels > 0:
feat = feats[-1]
for conv_layer in self.extra_convs:
feat = conv_layer(feat)
fpn_feats.append(feat)
for bifpn_cell in self.bifpn_cells:
fpn_feats = bifpn_cell(fpn_feats)
return fpn_feats
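As a quick sanity check of the neck above, a minimal usage sketch follows; it assumes PaddleDetection is installed and that this file is importable as ppdet.modeling.necks.bifpn (the module path is an assumption based on the package's __init__.py):

import paddle
from ppdet.modeling.necks.bifpn import BiFPN  # assumed module path

# Three ResNet-50 style levels (C3-C5) plus two extra levels give P3-P7.
neck = BiFPN(in_channels=(512, 1024, 2048), out_channel=256)
feats = [
    paddle.rand([1, 512, 80, 80]),   # stride 8 for a 640x640 input
    paddle.rand([1, 1024, 40, 40]),  # stride 16
    paddle.rand([1, 2048, 20, 20]),  # stride 32
]
outs = neck(feats)
print([o.shape for o in outs])  # five 256-channel maps at strides 8-128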


@@ -0,0 +1,213 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
from paddle import ParamAttr
import paddle.nn as nn
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['BlazeNeck']
def hard_swish(x):
return x * F.relu6(x + 3) / 6.
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
num_groups=1,
act='relu',
conv_lr=0.1,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ConvBNLayer, self).__init__()
self.act = act
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=conv_lr, initializer=KaimingNormal()),
bias_attr=False)
        if norm_type in ['sync_bn', 'bn']:
            self._batch_norm = nn.BatchNorm2D(out_channels)
        else:
            self._batch_norm = None
    def forward(self, x):
        x = self._conv(x)
        if self._batch_norm is not None:
            x = self._batch_norm(x)
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
elif self.act == 'leaky':
x = F.leaky_relu(x)
elif self.act == 'hard_swish':
x = hard_swish(x)
return x
class FPN(nn.Layer):
def __init__(self, in_channels, out_channels, name=None):
super(FPN, self).__init__()
self.conv1_fpn = ConvBNLayer(
in_channels,
out_channels // 2,
kernel_size=1,
padding=0,
stride=1,
act='leaky',
name=name + '_output1')
self.conv2_fpn = ConvBNLayer(
in_channels,
out_channels // 2,
kernel_size=1,
padding=0,
stride=1,
act='leaky',
name=name + '_output2')
self.conv3_fpn = ConvBNLayer(
out_channels // 2,
out_channels // 2,
kernel_size=3,
padding=1,
stride=1,
act='leaky',
name=name + '_merge')
def forward(self, input):
output1 = self.conv1_fpn(input[0])
output2 = self.conv2_fpn(input[1])
up2 = F.upsample(
output2, size=paddle.shape(output1)[-2:], mode='nearest')
output1 = paddle.add(output1, up2)
output1 = self.conv3_fpn(output1)
return output1, output2
class SSH(nn.Layer):
def __init__(self, in_channels, out_channels, name=None):
super(SSH, self).__init__()
assert out_channels % 4 == 0
self.conv0_ssh = ConvBNLayer(
in_channels,
out_channels // 2,
kernel_size=3,
padding=1,
stride=1,
act=None,
name=name + 'ssh_conv3')
self.conv1_ssh = ConvBNLayer(
out_channels // 2,
out_channels // 4,
kernel_size=3,
padding=1,
stride=1,
act='leaky',
name=name + 'ssh_conv5_1')
self.conv2_ssh = ConvBNLayer(
out_channels // 4,
out_channels // 4,
kernel_size=3,
padding=1,
stride=1,
act=None,
name=name + 'ssh_conv5_2')
self.conv3_ssh = ConvBNLayer(
out_channels // 4,
out_channels // 4,
kernel_size=3,
padding=1,
stride=1,
act='leaky',
name=name + 'ssh_conv7_1')
self.conv4_ssh = ConvBNLayer(
out_channels // 4,
out_channels // 4,
kernel_size=3,
padding=1,
stride=1,
act=None,
name=name + 'ssh_conv7_2')
def forward(self, x):
conv0 = self.conv0_ssh(x)
conv1 = self.conv1_ssh(conv0)
conv2 = self.conv2_ssh(conv1)
conv3 = self.conv3_ssh(conv2)
conv4 = self.conv4_ssh(conv3)
concat = paddle.concat([conv0, conv2, conv4], axis=1)
return F.relu(concat)
@register
@serializable
class BlazeNeck(nn.Layer):
def __init__(self, in_channel, neck_type="None", data_format='NCHW'):
super(BlazeNeck, self).__init__()
        self.neck_type = neck_type
        self.return_input = False
        self._out_channels = in_channel
        if self.neck_type == 'None':
            self.return_input = True
if "fpn" in self.neck_type:
self.fpn = FPN(self._out_channels[0],
self._out_channels[1],
name='fpn')
self._out_channels = [
self._out_channels[0] // 2, self._out_channels[1] // 2
]
if "ssh" in self.neck_type:
self.ssh1 = SSH(self._out_channels[0],
self._out_channels[0],
name='ssh1')
self.ssh2 = SSH(self._out_channels[1],
self._out_channels[1],
name='ssh2')
self._out_channels = [self._out_channels[0], self._out_channels[1]]
def forward(self, inputs):
        if self.return_input:
return inputs
output1, output2 = None, None
if "fpn" in self.neck_type:
backout_4, backout_1 = inputs
output1, output2 = self.fpn([backout_4, backout_1])
if self.neck_type == "only_fpn":
return [output1, output2]
if self.neck_type == "only_ssh":
output1, output2 = inputs
feature1 = self.ssh1(output1)
feature2 = self.ssh2(output2)
return [feature1, feature2]
@property
def out_shape(self):
return [
ShapeSpec(channels=c)
for c in [self._out_channels[0], self._out_channels[1]]
]
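A minimal sketch of the combined mode, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.blazeface_fpn (an assumed path). With neck_type="fpn_ssh", the two backbone levels are first fused by the small FPN and then each output is refined by an SSH context module:

import paddle
from ppdet.modeling.necks.blazeface_fpn import BlazeNeck  # assumed path

# Two 96-channel backbone levels (assumed BlazeNet-like shapes).
neck = BlazeNeck(in_channel=[96, 96], neck_type="fpn_ssh")
feats = [paddle.rand([1, 96, 16, 16]), paddle.rand([1, 96, 8, 8])]
f1, f2 = neck(feats)
print(f1.shape, f2.shape)  # [1, 48, 16, 16] [1, 48, 8, 8]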


@@ -0,0 +1,426 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn.initializer import Uniform
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import ConvNormLayer
from ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock
from ..shape_spec import ShapeSpec
__all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN']
# SGE attention
class BasicConv(nn.Layer):
def __init__(self,
in_planes,
out_planes,
kernel_size,
stride=1,
padding=0,
dilation=1,
groups=1,
relu=True,
bn=True,
bias_attr=False):
super(BasicConv, self).__init__()
self.out_channels = out_planes
self.conv = nn.Conv2D(
in_planes,
out_planes,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
groups=groups,
bias_attr=bias_attr)
self.bn = nn.BatchNorm2D(
out_planes,
epsilon=1e-5,
momentum=0.01,
weight_attr=False,
bias_attr=False) if bn else None
self.relu = nn.ReLU() if relu else None
def forward(self, x):
x = self.conv(x)
if self.bn is not None:
x = self.bn(x)
if self.relu is not None:
x = self.relu(x)
return x
class ChannelPool(nn.Layer):
def forward(self, x):
return paddle.concat(
(paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)),
axis=1)
class SpatialGate(nn.Layer):
def __init__(self):
super(SpatialGate, self).__init__()
kernel_size = 7
self.compress = ChannelPool()
self.spatial = BasicConv(
2,
1,
kernel_size,
stride=1,
padding=(kernel_size - 1) // 2,
relu=False)
def forward(self, x):
x_compress = self.compress(x)
x_out = self.spatial(x_compress)
scale = F.sigmoid(x_out) # broadcasting
return x * scale
def fill_up_weights(up):
weight = up.weight.numpy()
f = math.ceil(weight.shape[2] / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(weight.shape[2]):
for j in range(weight.shape[3]):
weight[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, weight.shape[0]):
weight[c, 0, :, :] = weight[0, 0, :, :]
up.weight.set_value(weight)
class IDAUp(nn.Layer):
def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True):
super(IDAUp, self).__init__()
for i in range(1, len(ch_ins)):
ch_in = ch_ins[i]
up_s = int(up_strides[i])
fan_in = ch_in * 3 * 3
stdv = 1. / math.sqrt(fan_in)
proj = nn.Sequential(
ConvNormLayer(
ch_in,
ch_out,
filter_size=3,
stride=1,
use_dcn=dcn_v2,
bias_on=dcn_v2,
norm_decay=None,
dcn_lr_scale=1.,
dcn_regularizer=None,
initializer=Uniform(-stdv, stdv)),
nn.ReLU())
node = nn.Sequential(
ConvNormLayer(
ch_out,
ch_out,
filter_size=3,
stride=1,
use_dcn=dcn_v2,
bias_on=dcn_v2,
norm_decay=None,
dcn_lr_scale=1.,
dcn_regularizer=None,
initializer=Uniform(-stdv, stdv)),
nn.ReLU())
kernel_size = up_s * 2
fan_in = ch_out * kernel_size * kernel_size
stdv = 1. / math.sqrt(fan_in)
up = nn.Conv2DTranspose(
ch_out,
ch_out,
kernel_size=up_s * 2,
stride=up_s,
padding=up_s // 2,
groups=ch_out,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=False)
fill_up_weights(up)
setattr(self, 'proj_' + str(i), proj)
setattr(self, 'up_' + str(i), up)
setattr(self, 'node_' + str(i), node)
def forward(self, inputs, start_level, end_level):
for i in range(start_level + 1, end_level):
upsample = getattr(self, 'up_' + str(i - start_level))
project = getattr(self, 'proj_' + str(i - start_level))
inputs[i] = project(inputs[i])
inputs[i] = upsample(inputs[i])
node = getattr(self, 'node_' + str(i - start_level))
inputs[i] = node(paddle.add(inputs[i], inputs[i - 1]))
return inputs
class DLAUp(nn.Layer):
def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True):
super(DLAUp, self).__init__()
self.start_level = start_level
if ch_in is None:
ch_in = channels
self.channels = channels
channels = list(channels)
scales = np.array(scales, dtype=int)
for i in range(len(channels) - 1):
j = -i - 2
setattr(
self,
'ida_{}'.format(i),
IDAUp(
ch_in[j:],
channels[j],
scales[j:] // scales[j],
dcn_v2=dcn_v2))
scales[j + 1:] = scales[j]
ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]]
def forward(self, inputs):
out = [inputs[-1]] # start with 32
for i in range(len(inputs) - self.start_level - 1):
ida = getattr(self, 'ida_{}'.format(i))
outputs = ida(inputs, len(inputs) - i - 2, len(inputs))
out.insert(0, outputs[-1])
return out
@register
@serializable
class CenterNetDLAFPN(nn.Layer):
"""
Args:
in_channels (list): number of input feature channels from backbone.
[16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34
down_ratio (int): the down ratio from images to heatmap, 4 by default
        last_level (int): the last level of input feature fed into the upsampling block
        out_channel (int): the channel of the output feature, 0 by default means
            the channel of the input feature whose down ratio is `down_ratio`
        first_level (int|None): the first level of input feature fed into the upsampling block.
            if None, the first level stands for log2(down_ratio)
dcn_v2 (bool): whether use the DCNv2, True by default
with_sge (bool): whether use SGE attention, False by default
"""
def __init__(self,
in_channels,
down_ratio=4,
last_level=5,
out_channel=0,
first_level=None,
dcn_v2=True,
with_sge=False):
super(CenterNetDLAFPN, self).__init__()
self.first_level = int(np.log2(
down_ratio)) if first_level is None else first_level
assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format(
self.first_level)
self.down_ratio = down_ratio
self.last_level = last_level
scales = [2**i for i in range(len(in_channels[self.first_level:]))]
self.dla_up = DLAUp(
self.first_level,
in_channels[self.first_level:],
scales,
dcn_v2=dcn_v2)
self.out_channel = out_channel
if out_channel == 0:
self.out_channel = in_channels[self.first_level]
self.ida_up = IDAUp(
in_channels[self.first_level:self.last_level],
self.out_channel,
[2**i for i in range(self.last_level - self.first_level)],
dcn_v2=dcn_v2)
self.with_sge = with_sge
if self.with_sge:
self.sge_attention = SpatialGate()
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape]}
def forward(self, body_feats):
inputs = [body_feats[i] for i in range(len(body_feats))]
dla_up_feats = self.dla_up(inputs)
ida_up_feats = []
for i in range(self.last_level - self.first_level):
ida_up_feats.append(dla_up_feats[i].clone())
self.ida_up(ida_up_feats, 0, len(ida_up_feats))
feat = ida_up_feats[-1]
if self.with_sge:
feat = self.sge_attention(feat)
if self.down_ratio != 4:
feat = F.interpolate(
feat,
scale_factor=self.down_ratio // 4,
mode="bilinear",
align_corners=True)
return feat
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)]
class TransitionUp(nn.Layer):
def __init__(self, in_channels, out_channels):
super().__init__()
def forward(self, x, skip):
        h, w = skip.shape[2], skip.shape[3]
        out = F.interpolate(x, size=(h, w), mode="bilinear", align_corners=True)
out = paddle.concat([out, skip], 1)
return out
@register
@serializable
class CenterNetHarDNetFPN(nn.Layer):
"""
Args:
in_channels (list): number of input feature channels from backbone.
[96, 214, 458, 784] by default, means the channels of HarDNet85
        num_layers (int): the number of HarDNet layers, 85 by default
        down_ratio (int): the down ratio from images to heatmap, 4 by default
        first_level (int|None): the first level of input feature fed into the upsampling block.
            if None, the first level stands for log2(down_ratio) - 1
last_level (int): the last level of input feature fed into the upsamplng block
out_channel (int): the channel of the output feature, 0 by default means
the channel of the input feature whose down ratio is `down_ratio`
"""
def __init__(self,
in_channels,
num_layers=85,
down_ratio=4,
first_level=None,
last_level=4,
out_channel=0):
super(CenterNetHarDNetFPN, self).__init__()
self.first_level = int(np.log2(
down_ratio)) - 1 if first_level is None else first_level
        assert self.first_level >= 0, "first level in CenterNetHarDNetFPN should be greater or equal to 0, but received {}".format(
            self.first_level)
self.down_ratio = down_ratio
self.last_level = last_level
self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2)
        assert num_layers in [68, 85], "HarDNet-{} is not supported.".format(
            num_layers)
if num_layers == 85:
self.last_proj = ConvLayer(784, 256, kernel_size=1)
self.last_blk = HarDBlock(768, 80, 1.7, 8)
self.skip_nodes = [1, 3, 8, 13]
self.SC = [32, 32, 0]
gr = [64, 48, 28]
layers = [8, 8, 4]
ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]]
channels = [96, 214, 458, 784]
self.skip_lv = 3
elif num_layers == 68:
self.last_proj = ConvLayer(654, 192, kernel_size=1)
self.last_blk = HarDBlock(576, 72, 1.7, 8)
self.skip_nodes = [1, 3, 8, 11]
self.SC = [32, 32, 0]
gr = [48, 32, 20]
layers = [8, 8, 4]
ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]]
channels = [64, 124, 328, 654]
self.skip_lv = 2
self.transUpBlocks = nn.LayerList([])
self.denseBlocksUp = nn.LayerList([])
self.conv1x1_up = nn.LayerList([])
self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4))
prev_ch = self.last_blk.get_out_ch()
for i in range(3):
skip_ch = channels[3 - i]
self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch))
if i < self.skip_lv:
cur_ch = prev_ch + skip_ch
else:
cur_ch = prev_ch
self.conv1x1_up.append(
ConvLayer(
cur_ch, ch_list2[i], kernel_size=1))
cur_ch = ch_list2[i]
cur_ch -= self.SC[i]
cur_ch *= 3
blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i])
self.denseBlocksUp.append(blk)
prev_ch = blk.get_out_ch()
prev_ch += self.SC[0] + self.SC[1] + self.SC[2]
self.out_channel = prev_ch
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape]}
def forward(self, body_feats):
x = body_feats[-1]
x_sc = []
x = self.last_proj(x)
x = self.last_pool(x)
x2 = self.avg9x9(x)
x3 = x / (x.sum((2, 3), keepdim=True) + 0.1)
x = paddle.concat([x, x2, x3], 1)
x = self.last_blk(x)
for i in range(3):
skip_x = body_feats[3 - i]
x_up = self.transUpBlocks[i](x, skip_x)
x_ch = self.conv1x1_up[i](x_up)
if self.SC[i] > 0:
end = x_ch.shape[1]
new_st = end - self.SC[i]
x_sc.append(x_ch[:, new_st:, :, :])
x_ch = x_ch[:, :new_st, :, :]
x2 = self.avg9x9(x_ch)
x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1)
x_new = paddle.concat([x_ch, x2, x3], 1)
x = self.denseBlocksUp[i](x_new)
scs = [x]
for i in range(3):
if self.SC[i] > 0:
scs.insert(
0,
F.interpolate(
x_sc[i],
size=(x.shape[2], x.shape[3]),
mode="bilinear",
align_corners=True))
neck_feat = paddle.concat(scs, 1)
return neck_feat
@property
def out_shape(self):
return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)]
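A minimal sketch for the DLA variant, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.centernet_fpn (an assumed path); dcn_v2=False keeps the sketch runnable without deformable-conv support:

import paddle
from ppdet.modeling.necks.centernet_fpn import CenterNetDLAFPN  # assumed path

# DLA-34 style channel list: six levels at strides 1, 2, 4, 8, 16, 32.
channels = [16, 32, 64, 128, 256, 512]
neck = CenterNetDLAFPN(in_channels=channels, dcn_v2=False)
feats = [
    paddle.rand([1, c, 512 // 2**i, 512 // 2**i])
    for i, c in enumerate(channels)
]
out = neck(feats)
print(out.shape)  # [1, 64, 128, 128], the stride-4 heatmap feature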


@@ -0,0 +1,122 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on mmdet: git@github.com:open-mmlab/mmdetection.git
"""
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
from ..backbones.hrnet import ConvNormLayer
from ..shape_spec import ShapeSpec
from ..initializer import xavier_uniform_, constant_
__all__ = ['ChannelMapper']
@register
@serializable
class ChannelMapper(nn.Layer):
"""Channel Mapper to reduce/increase channels of backbone features.
This is used to reduce/increase channels of backbone features.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale).
kernel_size (int, optional): kernel_size for reducing channels (used
at each scale). Default: 3.
        norm_type (str, optional): The normalization type used in
            ConvNormLayer. Default: "gn".
        norm_groups (int, optional): The number of groups when norm_type
            is "gn". Default: 32.
        act (str, optional): The activation of ConvNormLayer.
            Default: 'relu'.
num_outs (int, optional): Number of output feature maps. There
would be extra_convs when num_outs larger than the length
of in_channels.
init_cfg (dict or list[dict], optional): Initialization config dict.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
norm_type="gn",
norm_groups=32,
act='relu',
num_outs=None,
init_cfg=dict(
type='Xavier', layer='Conv2d', distribution='uniform')):
super(ChannelMapper, self).__init__()
assert isinstance(in_channels, list)
        self.extra_convs = None
        if num_outs is None:
            num_outs = len(in_channels)
        self.out_channels = out_channels
        self.num_outs = num_outs
        self.convs = nn.LayerList()
for in_channel in in_channels:
self.convs.append(
                ConvNormLayer(
                    ch_in=in_channel,
                    ch_out=out_channels,
                    filter_size=kernel_size,
                    norm_type=norm_type,
                    norm_groups=norm_groups,
                    act=act))
if num_outs > len(in_channels):
self.extra_convs = nn.LayerList()
for i in range(len(in_channels), num_outs):
if i == len(in_channels):
in_channel = in_channels[-1]
else:
in_channel = out_channels
self.extra_convs.append(
                    ConvNormLayer(
                        ch_in=in_channel,
                        ch_out=out_channels,
                        filter_size=3,
                        stride=2,
                        norm_type=norm_type,
                        norm_groups=norm_groups,
                        act=act))
self.init_weights()
def forward(self, inputs):
"""Forward function."""
assert len(inputs) == len(self.convs)
outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
if self.extra_convs:
for i in range(len(self.extra_convs)):
if i == 0:
outs.append(self.extra_convs[0](inputs[-1]))
else:
outs.append(self.extra_convs[i](outs[-1]))
return tuple(outs)
    @property
    def out_shape(self):
        return [
            ShapeSpec(channels=self.out_channels)
            for _ in range(self.num_outs)
        ]
    def init_weights(self):
        """Initialize conv weights with Xavier uniform and biases with zeros."""
        for m in self.sublayers():
            if isinstance(m, nn.Conv2D):
                xavier_uniform_(m.weight)
                if m.bias is not None:
                    constant_(m.bias)
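A minimal sketch, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.channel_mapper (an assumed path). num_outs=4 maps three backbone levels to 256 channels and appends one extra stride-2 level, as DETR-style heads expect:

import paddle
from ppdet.modeling.necks.channel_mapper import ChannelMapper  # assumed path

neck = ChannelMapper(
    in_channels=[512, 1024, 2048], out_channels=256, num_outs=4)
feats = [
    paddle.rand([1, 512, 64, 64]),
    paddle.rand([1, 1024, 32, 32]),
    paddle.rand([1, 2048, 16, 16]),
]
outs = neck(feats)
print([o.shape for o in outs])  # four 256-channel maps, the last at 8x8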


@@ -0,0 +1,254 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import XavierUniform
from ppdet.modeling.initializer import kaiming_normal_, constant_
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import ConvNormLayer
from ppdet.modeling.shape_spec import ShapeSpec
__all__ = ['CLRFPN']
@register
@serializable
class CLRFPN(nn.Layer):
"""
Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
out_channel (int): output channel of each level
spatial_scales (list[float]): the spatial scales between input feature
maps and original input image which can be derived from the output
shape of backbone by from_config
has_extra_convs (bool): whether to add extra conv to the last level.
default False
extra_stage (int): the number of extra stages added to the last level.
default 1
use_c5 (bool): Whether to use c5 as the input of extra stage,
otherwise p5 is used. default True
norm_type (string|None): The normalization type in FPN module. If
norm_type is None, norm will not be used after conv and if
norm_type is string, bn, gn, sync_bn are available. default None
norm_decay (float): weight decay for normalization layer weights.
default 0.
freeze_norm (bool): whether to freeze normalization layer.
default False
        relu_before_extra_convs (bool): whether to add relu before extra convs.
            default True
"""
def __init__(self,
in_channels,
out_channel,
spatial_scales=[0.25, 0.125, 0.0625, 0.03125],
has_extra_convs=False,
extra_stage=1,
use_c5=True,
norm_type=None,
norm_decay=0.,
freeze_norm=False,
relu_before_extra_convs=True):
super(CLRFPN, self).__init__()
self.out_channel = out_channel
for s in range(extra_stage):
spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
self.spatial_scales = spatial_scales
self.has_extra_convs = has_extra_convs
self.extra_stage = extra_stage
self.use_c5 = use_c5
self.relu_before_extra_convs = relu_before_extra_convs
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.in_channels = in_channels
self.lateral_convs = []
self.fpn_convs = []
fan = out_channel * 3 * 3
# stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone
# 0 <= st_stage < ed_stage <= 3
st_stage = 4 - len(in_channels)
ed_stage = st_stage + len(in_channels) - 1
for i in range(st_stage, ed_stage + 1):
# if i == 3:
# lateral_name = 'fpn_inner_res5_sum'
# else:
# lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
lateral_name = "lateral_convs.{}.conv".format(i - 1)
in_c = in_channels[i - st_stage]
if self.norm_type is not None:
lateral = self.add_sublayer(
lateral_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channel,
filter_size=1,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=in_c)))
else:
lateral = self.add_sublayer(
lateral_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channel,
kernel_size=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=in_c))))
self.lateral_convs.append(lateral)
fpn_name = "fpn_convs.{}.conv".format(i - 1)
if self.norm_type is not None:
fpn_conv = self.add_sublayer(
fpn_name,
ConvNormLayer(
ch_in=out_channel,
ch_out=out_channel,
filter_size=3,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
fpn_conv = self.add_sublayer(
fpn_name,
nn.Conv2D(
in_channels=out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(fpn_conv)
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
if self.has_extra_convs:
for i in range(self.extra_stage):
lvl = ed_stage + 1 + i
if i == 0 and self.use_c5:
in_c = in_channels[-1]
else:
in_c = out_channel
extra_fpn_name = 'fpn_{}'.format(lvl + 2)
if self.norm_type is not None:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channel,
filter_size=3,
stride=2,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channel,
kernel_size=3,
stride=2,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(extra_fpn_conv)
self.init_weights()
def init_weights(self):
for m in self.lateral_convs:
if isinstance(m, (nn.Conv1D, nn.Conv2D)):
kaiming_normal_(
m.weight, a=0, mode='fan_out', nonlinearity='relu')
if m.bias is not None:
constant_(m.bias, value=0.)
elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)):
constant_(m.weight, value=1)
constant_(m.bias, value=0)
for m in self.fpn_convs:
if isinstance(m, (nn.Conv1D, nn.Conv2D)):
kaiming_normal_(
m.weight, a=0, mode='fan_out', nonlinearity='relu')
if m.bias is not None:
constant_(m.bias, value=0.)
elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)):
constant_(m.weight, value=1)
constant_(m.bias, value=0)
@classmethod
def from_config(cls, cfg, input_shape):
return {}
def forward(self, body_feats):
laterals = []
if len(body_feats) > len(self.in_channels):
for _ in range(len(body_feats) - len(self.in_channels)):
del body_feats[0]
num_levels = len(body_feats)
for i in range(num_levels):
laterals.append(self.lateral_convs[i](body_feats[i]))
for i in range(1, num_levels):
lvl = num_levels - i
upsample = F.interpolate(
laterals[lvl],
scale_factor=2.,
mode='nearest', )
laterals[lvl - 1] += upsample
fpn_output = []
for lvl in range(num_levels):
fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))
if self.extra_stage > 0:
# use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
if not self.has_extra_convs:
                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has no extra convs'
fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
else:
if self.use_c5:
extra_source = body_feats[-1]
else:
extra_source = fpn_output[-1]
fpn_output.append(self.fpn_convs[num_levels](extra_source))
for i in range(1, self.extra_stage):
if self.relu_before_extra_convs:
fpn_output.append(self.fpn_convs[num_levels + i](F.relu(
fpn_output[-1])))
else:
fpn_output.append(self.fpn_convs[num_levels + i](
fpn_output[-1]))
return fpn_output
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channel, stride=1. / s)
for s in self.spatial_scales
]
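A minimal sketch, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.clrnet_fpn (an assumed path); extra_stage=0 disables the max-pooled top level so exactly one output per input level comes back:

import paddle
from ppdet.modeling.necks.clrnet_fpn import CLRFPN  # assumed path

# Lane-detection style input sizes (assumed shapes, e.g. a 320x800 image).
neck = CLRFPN(in_channels=[512, 1024, 2048], out_channel=64, extra_stage=0)
feats = [
    paddle.rand([1, 512, 40, 100]),
    paddle.rand([1, 1024, 20, 50]),
    paddle.rand([1, 2048, 10, 25]),
]
outs = neck(feats)
print([o.shape for o in outs])  # three 64-channel maps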


@@ -0,0 +1,363 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CSPPAN']
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channel=96,
out_channel=96,
kernel_size=3,
stride=1,
groups=1,
act='leaky_relu'):
super(ConvBNLayer, self).__init__()
initializer = nn.initializer.KaimingUniform()
self.conv = nn.Conv2D(
in_channels=in_channel,
out_channels=out_channel,
kernel_size=kernel_size,
groups=groups,
padding=(kernel_size - 1) // 2,
stride=stride,
weight_attr=ParamAttr(initializer=initializer),
bias_attr=False)
self.bn = nn.BatchNorm2D(out_channel)
if act == "hard_swish":
act = 'hardswish'
self.act = act
def forward(self, x):
x = self.bn(self.conv(x))
if self.act:
x = getattr(F, self.act)(x)
return x
class DPModule(nn.Layer):
"""
Depth-wise and point-wise module.
Args:
in_channel (int): The input channels of this Module.
out_channel (int): The output channels of this Module.
kernel_size (int): The conv2d kernel size of this Module.
stride (int): The conv2d's stride of this Module.
act (str): The activation function of this Module,
Now support `leaky_relu` and `hard_swish`.
"""
def __init__(self,
in_channel=96,
out_channel=96,
kernel_size=3,
stride=1,
act='leaky_relu',
use_act_in_out=True):
super(DPModule, self).__init__()
initializer = nn.initializer.KaimingUniform()
self.use_act_in_out = use_act_in_out
self.dwconv = nn.Conv2D(
in_channels=in_channel,
out_channels=out_channel,
kernel_size=kernel_size,
groups=out_channel,
padding=(kernel_size - 1) // 2,
stride=stride,
weight_attr=ParamAttr(initializer=initializer),
bias_attr=False)
self.bn1 = nn.BatchNorm2D(out_channel)
self.pwconv = nn.Conv2D(
in_channels=out_channel,
out_channels=out_channel,
kernel_size=1,
groups=1,
padding=0,
weight_attr=ParamAttr(initializer=initializer),
bias_attr=False)
self.bn2 = nn.BatchNorm2D(out_channel)
if act == "hard_swish":
act = 'hardswish'
self.act = act
def forward(self, x):
x = self.bn1(self.dwconv(x))
if self.act:
x = getattr(F, self.act)(x)
x = self.bn2(self.pwconv(x))
if self.use_act_in_out and self.act:
x = getattr(F, self.act)(x)
return x
class DarknetBottleneck(nn.Layer):
"""The basic bottleneck block used in Darknet.
Each Block consists of two ConvModules and the input is added to the
final output. Each ConvModule is composed of Conv, BN, and act.
    The first convLayer has a filter size of 1x1 and the second one has a
    filter size of kernel_size (3x3 by default).
Args:
in_channels (int): The input channels of this Module.
out_channels (int): The output channels of this Module.
        expansion (float): The ratio used to compute the hidden channels.
            Default: 0.5
add_identity (bool): Whether to add identity to the out.
Default: True
use_depthwise (bool): Whether to use depthwise separable convolution.
Default: False
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
expansion=0.5,
add_identity=True,
use_depthwise=False,
act="leaky_relu"):
super(DarknetBottleneck, self).__init__()
hidden_channels = int(out_channels * expansion)
conv_func = DPModule if use_depthwise else ConvBNLayer
self.conv1 = ConvBNLayer(
in_channel=in_channels,
out_channel=hidden_channels,
kernel_size=1,
act=act)
self.conv2 = conv_func(
in_channel=hidden_channels,
out_channel=out_channels,
kernel_size=kernel_size,
stride=1,
act=act)
self.add_identity = \
add_identity and in_channels == out_channels
def forward(self, x):
identity = x
out = self.conv1(x)
out = self.conv2(out)
if self.add_identity:
return out + identity
else:
return out
class CSPLayer(nn.Layer):
"""Cross Stage Partial Layer.
Args:
in_channels (int): The input channels of the CSP layer.
out_channels (int): The output channels of the CSP layer.
expand_ratio (float): Ratio to adjust the number of channels of the
hidden layer. Default: 0.5
num_blocks (int): Number of blocks. Default: 1
add_identity (bool): Whether to add identity in blocks.
Default: True
        use_depthwise (bool): Whether to use depthwise separable convolution
            in blocks. Default: False
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
expand_ratio=0.5,
num_blocks=1,
add_identity=True,
use_depthwise=False,
act="leaky_relu"):
super().__init__()
mid_channels = int(out_channels * expand_ratio)
self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act)
self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act)
self.final_conv = ConvBNLayer(
2 * mid_channels, out_channels, 1, act=act)
self.blocks = nn.Sequential(* [
DarknetBottleneck(
mid_channels,
mid_channels,
kernel_size,
1.0,
add_identity,
use_depthwise,
act=act) for _ in range(num_blocks)
])
def forward(self, x):
x_short = self.short_conv(x)
x_main = self.main_conv(x)
x_main = self.blocks(x_main)
x_final = paddle.concat((x_main, x_short), axis=1)
return self.final_conv(x_final)
class Channel_T(nn.Layer):
def __init__(self,
in_channels=[116, 232, 464],
out_channels=96,
act="leaky_relu"):
super(Channel_T, self).__init__()
self.convs = nn.LayerList()
for i in range(len(in_channels)):
self.convs.append(
ConvBNLayer(
in_channels[i], out_channels, 1, act=act))
def forward(self, x):
outs = [self.convs[i](x[i]) for i in range(len(x))]
return outs
@register
@serializable
class CSPPAN(nn.Layer):
"""Path Aggregation Network with CSP module.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
kernel_size (int): The conv2d kernel size of this Module.
num_features (int): Number of output features of CSPPAN module.
num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1
        use_depthwise (bool): Whether to use depthwise separable convolution
            in blocks. Default: True
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=5,
num_features=3,
num_csp_blocks=1,
use_depthwise=True,
act='hard_swish',
spatial_scales=[0.125, 0.0625, 0.03125]):
super(CSPPAN, self).__init__()
self.conv_t = Channel_T(in_channels, out_channels, act=act)
in_channels = [out_channels] * len(spatial_scales)
self.in_channels = in_channels
self.out_channels = out_channels
        # copy, so the append below for num_features == 4 does not mutate
        # the shared default list argument
        self.spatial_scales = list(spatial_scales)
self.num_features = num_features
conv_func = DPModule if use_depthwise else ConvBNLayer
if self.num_features == 4:
self.first_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
self.second_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
self.spatial_scales.append(self.spatial_scales[-1] / 2)
# build top-down blocks
self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.top_down_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1, 0, -1):
self.top_down_blocks.append(
CSPLayer(
in_channels[idx - 1] * 2,
in_channels[idx - 1],
kernel_size=kernel_size,
num_blocks=num_csp_blocks,
add_identity=False,
use_depthwise=use_depthwise,
act=act))
# build bottom-up blocks
self.downsamples = nn.LayerList()
self.bottom_up_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1):
self.downsamples.append(
conv_func(
in_channels[idx],
in_channels[idx],
kernel_size=kernel_size,
stride=2,
act=act))
self.bottom_up_blocks.append(
CSPLayer(
in_channels[idx] * 2,
in_channels[idx + 1],
kernel_size=kernel_size,
num_blocks=num_csp_blocks,
add_identity=False,
use_depthwise=use_depthwise,
act=act))
def forward(self, inputs):
"""
Args:
inputs (tuple[Tensor]): input features.
Returns:
tuple[Tensor]: CSPPAN features.
"""
assert len(inputs) == len(self.in_channels)
inputs = self.conv_t(inputs)
# top-down path
inner_outs = [inputs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            upsample_feat = self.upsample(feat_high)
inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
paddle.concat([upsample_feat, feat_low], 1))
inner_outs.insert(0, inner_out)
# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](paddle.concat(
                [downsample_feat, feat_high], 1))
outs.append(out)
top_features = None
if self.num_features == 4:
top_features = self.first_top_conv(inputs[-1])
top_features = top_features + self.second_top_conv(outs[-1])
outs.append(top_features)
return tuple(outs)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channels, stride=1. / s)
for s in self.spatial_scales
]
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
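A minimal sketch, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.csp_pan (an assumed path); num_features=4 adds a stride-64 top level built from the two extra stride-2 convs:

import paddle
from ppdet.modeling.necks.csp_pan import CSPPAN  # assumed path

# PP-PicoDet style: ShuffleNet-like channels in, four 96-channel levels out.
neck = CSPPAN(in_channels=[116, 232, 464], out_channels=96, num_features=4)
feats = [
    paddle.rand([1, 116, 40, 40]),
    paddle.rand([1, 232, 20, 20]),
    paddle.rand([1, 464, 10, 10]),
]
outs = neck(feats)
print([o.shape for o in outs])  # strides 8/16/32/64, 96 channels each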


@@ -0,0 +1,398 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import copy
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import DropBlock, MultiHeadAttention
from ppdet.modeling.ops import get_act_fn
from ..backbones.cspresnet import ConvBNLayer, BasicBlock
from ..shape_spec import ShapeSpec
from ..initializer import linear_init_
__all__ = ['CustomCSPPAN']
def _get_clones(module, N):
return nn.LayerList([copy.deepcopy(module) for _ in range(N)])
class SPP(nn.Layer):
def __init__(self,
ch_in,
ch_out,
k,
pool_size,
act='swish',
data_format='NCHW'):
super(SPP, self).__init__()
self.pool = []
self.data_format = data_format
for i, size in enumerate(pool_size):
pool = self.add_sublayer(
'pool{}'.format(i),
nn.MaxPool2D(
kernel_size=size,
stride=1,
padding=size // 2,
data_format=data_format,
ceil_mode=False))
self.pool.append(pool)
self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act)
def forward(self, x):
outs = [x]
for pool in self.pool:
outs.append(pool(x))
if self.data_format == 'NCHW':
y = paddle.concat(outs, axis=1)
else:
y = paddle.concat(outs, axis=-1)
y = self.conv(y)
return y
class CSPStage(nn.Layer):
def __init__(self,
block_fn,
ch_in,
ch_out,
n,
act='swish',
spp=False,
use_alpha=False):
super(CSPStage, self).__init__()
ch_mid = int(ch_out // 2)
self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act)
self.convs = nn.Sequential()
next_ch_in = ch_mid
for i in range(n):
self.convs.add_sublayer(
str(i),
eval(block_fn)(next_ch_in,
ch_mid,
act=act,
shortcut=False,
use_alpha=use_alpha))
if i == (n - 1) // 2 and spp:
self.convs.add_sublayer(
'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act))
next_ch_in = ch_mid
self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act)
def forward(self, x):
y1 = self.conv1(x)
y2 = self.conv2(x)
y2 = self.convs(y2)
y = paddle.concat([y1, y2], axis=1)
y = self.conv3(y)
return y
class TransformerEncoderLayer(nn.Layer):
def __init__(self,
d_model,
nhead,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
attn_dropout=None,
act_dropout=None,
normalize_before=False):
super(TransformerEncoderLayer, self).__init__()
attn_dropout = dropout if attn_dropout is None else attn_dropout
act_dropout = dropout if act_dropout is None else act_dropout
self.normalize_before = normalize_before
self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
self.activation = getattr(F, activation)
self._reset_parameters()
def _reset_parameters(self):
linear_init_(self.linear1)
linear_init_(self.linear2)
@staticmethod
def with_pos_embed(tensor, pos_embed):
return tensor if pos_embed is None else tensor + pos_embed
def forward(self, src, src_mask=None, pos_embed=None):
residual = src
if self.normalize_before:
src = self.norm1(src)
q = k = self.with_pos_embed(src, pos_embed)
src = self.self_attn(q, k, value=src, attn_mask=src_mask)
src = residual + self.dropout1(src)
if not self.normalize_before:
src = self.norm1(src)
residual = src
if self.normalize_before:
src = self.norm2(src)
src = self.linear2(self.dropout(self.activation(self.linear1(src))))
src = residual + self.dropout2(src)
if not self.normalize_before:
src = self.norm2(src)
return src
class TransformerEncoder(nn.Layer):
def __init__(self, encoder_layer, num_layers, norm=None):
super(TransformerEncoder, self).__init__()
self.layers = _get_clones(encoder_layer, num_layers)
self.num_layers = num_layers
self.norm = norm
def forward(self, src, src_mask=None, pos_embed=None):
output = src
for layer in self.layers:
output = layer(output, src_mask=src_mask, pos_embed=pos_embed)
if self.norm is not None:
output = self.norm(output)
return output
@register
@serializable
class CustomCSPPAN(nn.Layer):
__shared__ = [
'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt',
'eval_size'
]
def __init__(self,
in_channels=[256, 512, 1024],
out_channels=[1024, 512, 256],
norm_type='bn',
act='leaky',
stage_fn='CSPStage',
block_fn='BasicBlock',
stage_num=1,
block_num=3,
drop_block=False,
block_size=3,
keep_prob=0.9,
spp=False,
data_format='NCHW',
width_mult=1.0,
depth_mult=1.0,
use_alpha=False,
trt=False,
dim_feedforward=2048,
dropout=0.1,
activation='gelu',
nhead=4,
num_layers=4,
attn_dropout=None,
act_dropout=None,
normalize_before=False,
use_trans=False,
eval_size=None):
super(CustomCSPPAN, self).__init__()
out_channels = [max(round(c * width_mult), 1) for c in out_channels]
block_num = max(round(block_num * depth_mult), 1)
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
self.num_blocks = len(in_channels)
self.data_format = data_format
self._out_channels = out_channels
self.hidden_dim = in_channels[-1]
in_channels = in_channels[::-1]
self.use_trans = use_trans
self.eval_size = eval_size
if use_trans:
if eval_size is not None:
self.pos_embed = self.build_2d_sincos_position_embedding(
eval_size[1] // 32,
eval_size[0] // 32,
embed_dim=self.hidden_dim)
else:
self.pos_embed = None
encoder_layer = TransformerEncoderLayer(
self.hidden_dim, nhead, dim_feedforward, dropout, activation,
attn_dropout, act_dropout, normalize_before)
encoder_norm = nn.LayerNorm(
self.hidden_dim) if normalize_before else None
self.encoder = TransformerEncoder(encoder_layer, num_layers,
encoder_norm)
fpn_stages = []
fpn_routes = []
for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)):
if i > 0:
ch_in += ch_pre // 2
stage = nn.Sequential()
for j in range(stage_num):
stage.add_sublayer(
str(j),
eval(stage_fn)(block_fn,
ch_in if j == 0 else ch_out,
ch_out,
block_num,
act=act,
spp=(spp and i == 0),
use_alpha=use_alpha))
if drop_block:
stage.add_sublayer('drop', DropBlock(block_size, keep_prob))
fpn_stages.append(stage)
if i < self.num_blocks - 1:
fpn_routes.append(
ConvBNLayer(
ch_in=ch_out,
ch_out=ch_out // 2,
filter_size=1,
stride=1,
padding=0,
act=act))
ch_pre = ch_out
self.fpn_stages = nn.LayerList(fpn_stages)
self.fpn_routes = nn.LayerList(fpn_routes)
pan_stages = []
pan_routes = []
for i in reversed(range(self.num_blocks - 1)):
pan_routes.append(
ConvBNLayer(
ch_in=out_channels[i + 1],
ch_out=out_channels[i + 1],
filter_size=3,
stride=2,
padding=1,
act=act))
ch_in = out_channels[i] + out_channels[i + 1]
ch_out = out_channels[i]
stage = nn.Sequential()
for j in range(stage_num):
stage.add_sublayer(
str(j),
eval(stage_fn)(block_fn,
ch_in if j == 0 else ch_out,
ch_out,
block_num,
act=act,
spp=False,
use_alpha=use_alpha))
if drop_block:
stage.add_sublayer('drop', DropBlock(block_size, keep_prob))
pan_stages.append(stage)
self.pan_stages = nn.LayerList(pan_stages[::-1])
self.pan_routes = nn.LayerList(pan_routes[::-1])
def build_2d_sincos_position_embedding(
self,
w,
h,
embed_dim=1024,
temperature=10000., ):
grid_w = paddle.arange(int(w), dtype=paddle.float32)
grid_h = paddle.arange(int(h), dtype=paddle.float32)
grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = 1. / (temperature**omega)
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]
pos_emb = paddle.concat(
[
paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
paddle.cos(out_h)
],
axis=1)[None, :, :]
return pos_emb
def forward(self, blocks, for_mot=False):
if self.use_trans:
last_feat = blocks[-1]
n, c, h, w = last_feat.shape
# flatten [B, C, H, W] to [B, HxW, C]
src_flatten = last_feat.flatten(2).transpose([0, 2, 1])
if self.eval_size is not None and not self.training:
pos_embed = self.pos_embed
else:
pos_embed = self.build_2d_sincos_position_embedding(
w=w, h=h, embed_dim=self.hidden_dim)
memory = self.encoder(src_flatten, pos_embed=pos_embed)
last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w])
blocks[-1] = last_feat_encode
blocks = blocks[::-1]
fpn_feats = []
for i, block in enumerate(blocks):
if i > 0:
block = paddle.concat([route, block], axis=1)
route = self.fpn_stages[i](block)
fpn_feats.append(route)
if i < self.num_blocks - 1:
route = self.fpn_routes[i](route)
route = F.interpolate(
route, scale_factor=2., data_format=self.data_format)
pan_feats = [fpn_feats[-1], ]
route = fpn_feats[-1]
for i in reversed(range(self.num_blocks - 1)):
block = fpn_feats[i]
route = self.pan_routes[i](route)
block = paddle.concat([route, block], axis=1)
route = self.pan_stages[i](block)
pan_feats.append(route)
return pan_feats[::-1]
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]
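A minimal sketch, assuming PaddleDetection is installed and this file is importable as ppdet.modeling.necks.custom_pan (an assumed path). The arguments mirror a PP-YOLOE style PAN: three backbone levels in, three fused levels out, with SPP enabled on the deepest FPN stage:

import paddle
from ppdet.modeling.necks.custom_pan import CustomCSPPAN  # assumed path

neck = CustomCSPPAN(in_channels=[256, 512, 1024],
                    out_channels=[1024, 512, 256], act='swish', spp=True)
feats = [
    paddle.rand([1, 256, 80, 80]),
    paddle.rand([1, 512, 40, 40]),
    paddle.rand([1, 1024, 20, 20]),
]
outs = neck(feats)
print([o.shape for o in outs])
# [1, 1024, 20, 20], [1, 512, 40, 40], [1, 256, 80, 80]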


@@ -0,0 +1,150 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingUniform, Constant, Normal
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['DilatedEncoder']
class Bottleneck(nn.Layer):
def __init__(self, in_channels, mid_channels, dilation):
super(Bottleneck, self).__init__()
self.conv1 = nn.Sequential(* [
nn.Conv2D(
in_channels,
mid_channels,
1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(0.0))),
nn.BatchNorm2D(
mid_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0))),
nn.ReLU(),
])
self.conv2 = nn.Sequential(* [
nn.Conv2D(
mid_channels,
mid_channels,
3,
padding=dilation,
dilation=dilation,
weight_attr=ParamAttr(initializer=Normal(
mean=0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(0.0))),
nn.BatchNorm2D(
mid_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0))),
nn.ReLU(),
])
self.conv3 = nn.Sequential(* [
nn.Conv2D(
mid_channels,
in_channels,
1,
padding=0,
weight_attr=ParamAttr(initializer=Normal(
mean=0, std=0.01)),
bias_attr=ParamAttr(initializer=Constant(0.0))),
nn.BatchNorm2D(
in_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0))),
nn.ReLU(),
])
def forward(self, x):
identity = x
y = self.conv3(self.conv2(self.conv1(x)))
return y + identity
@register
class DilatedEncoder(nn.Layer):
"""
DilatedEncoder used in YOLOF
"""
def __init__(self,
in_channels=[2048],
out_channels=[512],
block_mid_channels=128,
num_residual_blocks=4,
block_dilations=[2, 4, 6, 8]):
super(DilatedEncoder, self).__init__()
self.in_channels = in_channels
self.out_channels = out_channels
assert len(self.in_channels) == 1, "YOLOF only has one level feature."
assert len(self.out_channels) == 1, "YOLOF only has one level feature."
self.block_mid_channels = block_mid_channels
self.num_residual_blocks = num_residual_blocks
self.block_dilations = block_dilations
out_ch = self.out_channels[0]
self.lateral_conv = nn.Conv2D(
self.in_channels[0],
out_ch,
1,
weight_attr=ParamAttr(initializer=KaimingUniform(
negative_slope=1, nonlinearity='leaky_relu')),
bias_attr=ParamAttr(initializer=Constant(value=0.0)))
self.lateral_norm = nn.BatchNorm2D(
out_ch,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.fpn_conv = nn.Conv2D(
out_ch,
out_ch,
3,
padding=1,
weight_attr=ParamAttr(initializer=KaimingUniform(
negative_slope=1, nonlinearity='leaky_relu')))
self.fpn_norm = nn.BatchNorm2D(
out_ch,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
encoder_blocks = []
for i in range(self.num_residual_blocks):
encoder_blocks.append(
Bottleneck(
out_ch,
self.block_mid_channels,
dilation=block_dilations[i]))
self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks)
def forward(self, inputs, for_mot=False):
out = self.lateral_norm(self.lateral_conv(inputs[0]))
out = self.fpn_norm(self.fpn_conv(out))
out = self.dilated_encoder_blocks(out)
return [out]
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self.out_channels]
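
# Usage sketch (illustrative addition, not part of the original commit):
# runs the encoder on a hypothetical single-level C5 feature; the shapes
# below are assumptions for demonstration only.
if __name__ == '__main__':
    encoder = DilatedEncoder()  # defaults: 2048 -> 512 channels
    c5 = paddle.rand([1, 2048, 20, 20])  # backbone C5 output
    out = encoder([c5])  # spatial size preserved, channels projected to 512
    print(out[0].shape)  # [1, 512, 20, 20]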

View File

@@ -0,0 +1,212 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from ..backbones.esnet import SEModule
from .csp_pan import ConvBNLayer, Channel_T, DPModule
__all__ = ['ESPAN']
class ES_Block(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
kernel_size=5,
stride=1,
act='leaky_relu'):
super(ES_Block, self).__init__()
self._residual = ConvBNLayer(
in_channel=in_channels,
out_channel=out_channels,
kernel_size=1,
stride=1,
groups=1,
act=act)
self._conv_pw = ConvBNLayer(
in_channel=in_channels,
out_channel=mid_channels // 2,
kernel_size=1,
stride=1,
groups=1,
act=act)
self._conv_dw = ConvBNLayer(
in_channel=mid_channels // 2,
out_channel=mid_channels // 2,
kernel_size=kernel_size,
stride=stride,
groups=mid_channels // 2,
act=None)
self._se = SEModule(mid_channels)
self._conv_linear = ConvBNLayer(
in_channel=mid_channels,
out_channel=out_channels,
kernel_size=1,
stride=1,
groups=1,
act=act)
self._out_conv = ConvBNLayer(
in_channel=out_channels * 2,
out_channel=out_channels,
kernel_size=1,
stride=1,
groups=1,
act=act)
def forward(self, inputs):
x1 = self._residual(inputs)
x2 = self._conv_pw(inputs)
x3 = self._conv_dw(x2)
x3 = paddle.concat([x2, x3], axis=1)
x3 = self._se(x3)
x3 = self._conv_linear(x3)
out = paddle.concat([x1, x3], axis=1)
out = self._out_conv(out)
return out
@register
@serializable
class ESPAN(nn.Layer):
"""Path Aggregation Network with ES module.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
kernel_size (int): The conv2d kernel size of this Module.
num_features (int): Number of output features of CSPPAN module.
num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1
use_depthwise (bool): Whether to depthwise separable convolution in
blocks. Default: True
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=5,
num_features=3,
use_depthwise=True,
act='hard_swish',
spatial_scales=[0.125, 0.0625, 0.03125]):
super(ESPAN, self).__init__()
self.conv_t = Channel_T(in_channels, out_channels, act=act)
in_channels = [out_channels] * len(spatial_scales)
self.in_channels = in_channels
self.out_channels = out_channels
self.spatial_scales = spatial_scales
self.num_features = num_features
conv_func = DPModule if use_depthwise else ConvBNLayer
if self.num_features == 4:
self.first_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
self.second_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
            # avoid mutating the shared default spatial_scales list
            self.spatial_scales = self.spatial_scales + [
                self.spatial_scales[-1] / 2
            ]
# build top-down blocks
self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.top_down_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1, 0, -1):
self.top_down_blocks.append(
ES_Block(
in_channels[idx - 1] * 2,
in_channels[idx - 1],
in_channels[idx - 1],
kernel_size=kernel_size,
stride=1,
act=act))
# build bottom-up blocks
self.downsamples = nn.LayerList()
self.bottom_up_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1):
self.downsamples.append(
conv_func(
in_channels[idx],
in_channels[idx],
kernel_size=kernel_size,
stride=2,
act=act))
self.bottom_up_blocks.append(
ES_Block(
in_channels[idx] * 2,
in_channels[idx + 1],
in_channels[idx + 1],
kernel_size=kernel_size,
stride=1,
act=act))
def forward(self, inputs):
"""
Args:
inputs (tuple[Tensor]): input features.
Returns:
tuple[Tensor]: CSPPAN features.
"""
assert len(inputs) == len(self.in_channels)
inputs = self.conv_t(inputs)
# top-down path
inner_outs = [inputs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            upsample_feat = self.upsample(feat_high)
inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
paddle.concat([upsample_feat, feat_low], 1))
inner_outs.insert(0, inner_out)
# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](paddle.concat(
                [downsample_feat, feat_high], 1))
outs.append(out)
top_features = None
if self.num_features == 4:
top_features = self.first_top_conv(inputs[-1])
top_features = top_features + self.second_top_conv(outs[-1])
outs.append(top_features)
return tuple(outs)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channels, stride=1. / s)
for s in self.spatial_scales
]
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
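
# Usage sketch (illustrative addition, not part of the original commit):
# runs ESPAN over three hypothetical feature levels at strides 8/16/32;
# channel and spatial sizes are assumptions for demonstration only.
if __name__ == '__main__':
    pan = ESPAN(in_channels=[96, 192, 384], out_channels=96)
    feats = [
        paddle.rand([1, c, 40 // (2**i), 40 // (2**i)])
        for i, c in enumerate([96, 192, 384])
    ]
    outs = pan(feats)  # three maps, each with out_channels channels
    print([o.shape for o in outs])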

View File

@@ -0,0 +1,231 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import XavierUniform
from ppdet.core.workspace import register, serializable
from ppdet.modeling.layers import ConvNormLayer
from ..shape_spec import ShapeSpec
__all__ = ['FPN']
@register
@serializable
class FPN(nn.Layer):
"""
Feature Pyramid Network, see https://arxiv.org/abs/1612.03144
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
out_channel (int): output channel of each level
spatial_scales (list[float]): the spatial scales between input feature
maps and original input image which can be derived from the output
shape of backbone by from_config
has_extra_convs (bool): whether to add extra conv to the last level.
default False
extra_stage (int): the number of extra stages added to the last level.
default 1
use_c5 (bool): Whether to use c5 as the input of extra stage,
otherwise p5 is used. default True
        norm_type (string|None): The normalization type in FPN module. If
            norm_type is None, norm will not be used after conv; if it is a
            string, 'bn', 'gn' and 'sync_bn' are available. default None
norm_decay (float): weight decay for normalization layer weights.
default 0.
freeze_norm (bool): whether to freeze normalization layer.
default False
        relu_before_extra_convs (bool): whether to add relu before extra convs.
            default True
"""
def __init__(self,
in_channels,
out_channel,
spatial_scales=[0.25, 0.125, 0.0625, 0.03125],
has_extra_convs=False,
extra_stage=1,
use_c5=True,
norm_type=None,
norm_decay=0.,
freeze_norm=False,
relu_before_extra_convs=True):
super(FPN, self).__init__()
self.out_channel = out_channel
for s in range(extra_stage):
spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
self.spatial_scales = spatial_scales
self.has_extra_convs = has_extra_convs
self.extra_stage = extra_stage
self.use_c5 = use_c5
self.relu_before_extra_convs = relu_before_extra_convs
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.lateral_convs = []
self.fpn_convs = []
fan = out_channel * 3 * 3
# stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone
# 0 <= st_stage < ed_stage <= 3
st_stage = 4 - len(in_channels)
ed_stage = st_stage + len(in_channels) - 1
for i in range(st_stage, ed_stage + 1):
if i == 3:
lateral_name = 'fpn_inner_res5_sum'
else:
lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2)
in_c = in_channels[i - st_stage]
if self.norm_type is not None:
lateral = self.add_sublayer(
lateral_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channel,
filter_size=1,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=in_c)))
else:
lateral = self.add_sublayer(
lateral_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channel,
kernel_size=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=in_c))))
self.lateral_convs.append(lateral)
fpn_name = 'fpn_res{}_sum'.format(i + 2)
if self.norm_type is not None:
fpn_conv = self.add_sublayer(
fpn_name,
ConvNormLayer(
ch_in=out_channel,
ch_out=out_channel,
filter_size=3,
stride=1,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
fpn_conv = self.add_sublayer(
fpn_name,
nn.Conv2D(
in_channels=out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(fpn_conv)
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
if self.has_extra_convs:
for i in range(self.extra_stage):
lvl = ed_stage + 1 + i
if i == 0 and self.use_c5:
in_c = in_channels[-1]
else:
in_c = out_channel
extra_fpn_name = 'fpn_{}'.format(lvl + 2)
if self.norm_type is not None:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
ConvNormLayer(
ch_in=in_c,
ch_out=out_channel,
filter_size=3,
stride=2,
norm_type=self.norm_type,
norm_decay=self.norm_decay,
freeze_norm=self.freeze_norm,
initializer=XavierUniform(fan_out=fan)))
else:
extra_fpn_conv = self.add_sublayer(
extra_fpn_name,
nn.Conv2D(
in_channels=in_c,
out_channels=out_channel,
kernel_size=3,
stride=2,
padding=1,
weight_attr=ParamAttr(
initializer=XavierUniform(fan_out=fan))))
self.fpn_convs.append(extra_fpn_conv)
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'spatial_scales': [1.0 / i.stride for i in input_shape],
}
def forward(self, body_feats):
laterals = []
num_levels = len(body_feats)
for i in range(num_levels):
laterals.append(self.lateral_convs[i](body_feats[i]))
for i in range(1, num_levels):
lvl = num_levels - i
upsample = F.interpolate(
laterals[lvl],
scale_factor=2.,
mode='nearest', )
laterals[lvl - 1] += upsample
fpn_output = []
for lvl in range(num_levels):
fpn_output.append(self.fpn_convs[lvl](laterals[lvl]))
if self.extra_stage > 0:
# use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN)
if not self.has_extra_convs:
                assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has no extra convs'
fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2))
# add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5)
else:
if self.use_c5:
extra_source = body_feats[-1]
else:
extra_source = fpn_output[-1]
fpn_output.append(self.fpn_convs[num_levels](extra_source))
for i in range(1, self.extra_stage):
if self.relu_before_extra_convs:
fpn_output.append(self.fpn_convs[num_levels + i](F.relu(
fpn_output[-1])))
else:
fpn_output.append(self.fpn_convs[num_levels + i](
fpn_output[-1]))
return fpn_output
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channel, stride=1. / s)
for s in self.spatial_scales
]
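
# Usage sketch (illustrative addition, not part of the original commit):
# builds an FPN over hypothetical ResNet-50 C2-C5 features; channel and
# spatial sizes are assumptions for demonstration only.
if __name__ == '__main__':
    import paddle
    fpn = FPN(in_channels=[256, 512, 1024, 2048], out_channel=256)
    feats = [
        paddle.rand([1, c, 64 // (2**i), 64 // (2**i)])
        for i, c in enumerate([256, 512, 1024, 2048])
    ]
    outs = fpn(feats)  # P2-P5 plus a max-pooled P6 (extra_stage=1 by default)
    print([o.shape for o in outs])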

View File

@@ -0,0 +1,129 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['HRFPN']
@register
class HRFPN(nn.Layer):
"""
Args:
        in_channels (list): number of input feature channels from backbone
        out_channel (int): number of output feature channels
        share_conv (bool): whether to share one 3x3 conv across all output
            levels
        extra_stage (int): number of extra (downsampled) stages added to the
            HRFPN outputs
        spatial_scales (list): feature map scaling factors
        use_bias (bool): whether the conv layers have bias, False by default
"""
def __init__(self,
in_channels=[18, 36, 72, 144],
out_channel=256,
share_conv=False,
extra_stage=1,
spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32],
use_bias=False):
super(HRFPN, self).__init__()
in_channel = sum(in_channels)
self.in_channel = in_channel
self.out_channel = out_channel
self.share_conv = share_conv
for i in range(extra_stage):
spatial_scales = spatial_scales + [spatial_scales[-1] / 2.]
self.spatial_scales = spatial_scales
self.num_out = len(self.spatial_scales)
self.use_bias = use_bias
bias_attr = False if use_bias is False else None
self.reduction = nn.Conv2D(
in_channels=in_channel,
out_channels=out_channel,
kernel_size=1,
bias_attr=bias_attr)
if share_conv:
self.fpn_conv = nn.Conv2D(
in_channels=out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
bias_attr=bias_attr)
else:
self.fpn_conv = []
for i in range(self.num_out):
conv_name = "fpn_conv_" + str(i)
conv = self.add_sublayer(
conv_name,
nn.Conv2D(
in_channels=out_channel,
out_channels=out_channel,
kernel_size=3,
padding=1,
bias_attr=bias_attr))
self.fpn_conv.append(conv)
def forward(self, body_feats):
num_backbone_stages = len(body_feats)
outs = []
outs.append(body_feats[0])
# resize
for i in range(1, num_backbone_stages):
resized = F.interpolate(
body_feats[i], scale_factor=2**i, mode='bilinear')
outs.append(resized)
# concat
out = paddle.concat(outs, axis=1)
        assert out.shape[1] == self.in_channel, \
            'in_channel should be {}, but received {}'.format(
                self.in_channel, out.shape[1])
# reduction
out = self.reduction(out)
# conv
outs = [out]
for i in range(1, self.num_out):
outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i))
outputs = []
for i in range(self.num_out):
conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i]
conv = conv_func(outs[i])
outputs.append(conv)
fpn_feats = [outputs[k] for k in range(self.num_out)]
return fpn_feats
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'spatial_scales': [1.0 / i.stride for i in input_shape],
}
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channel, stride=1. / s)
for s in self.spatial_scales
]
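
# Usage sketch (illustrative addition, not part of the original commit):
# aggregates four hypothetical HRNet branches into five pyramid levels;
# channel and spatial sizes are assumptions for demonstration only.
if __name__ == '__main__':
    hrfpn = HRFPN()  # defaults: in_channels=[18, 36, 72, 144], out_channel=256
    feats = [
        paddle.rand([1, c, 32 // (2**i), 32 // (2**i)])
        for i, c in enumerate([18, 36, 72, 144])
    ]
    outs = hrfpn(feats)  # len(outs) == 5 with the default extra_stage=1
    print([o.shape for o in outs])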

View File

@@ -0,0 +1,168 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from ..backbones.lcnet import DepthwiseSeparable
from .csp_pan import ConvBNLayer, Channel_T, DPModule
__all__ = ['LCPAN']
@register
@serializable
class LCPAN(nn.Layer):
"""Path Aggregation Network with LCNet module.
Args:
in_channels (List[int]): Number of input channels per scale.
out_channels (int): Number of output channels (used at each scale)
kernel_size (int): The conv2d kernel size of this Module.
num_features (int): Number of output features of CSPPAN module.
num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1
use_depthwise (bool): Whether to depthwise separable convolution in
blocks. Default: True
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=5,
num_features=3,
use_depthwise=True,
act='hard_swish',
spatial_scales=[0.125, 0.0625, 0.03125]):
super(LCPAN, self).__init__()
self.conv_t = Channel_T(in_channels, out_channels, act=act)
in_channels = [out_channels] * len(spatial_scales)
self.in_channels = in_channels
self.out_channels = out_channels
self.spatial_scales = spatial_scales
self.num_features = num_features
conv_func = DPModule if use_depthwise else ConvBNLayer
NET_CONFIG = {
            # k, in_c, out_c, stride, use_se
"block1": [
[kernel_size, out_channels * 2, out_channels * 2, 1, False],
[kernel_size, out_channels * 2, out_channels, 1, False],
],
"block2": [
[kernel_size, out_channels * 2, out_channels * 2, 1, False],
[kernel_size, out_channels * 2, out_channels, 1, False],
]
}
if self.num_features == 4:
self.first_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
self.second_top_conv = conv_func(
in_channels[0], in_channels[0], kernel_size, stride=2, act=act)
            # avoid mutating the shared default spatial_scales list
            self.spatial_scales = self.spatial_scales + [
                self.spatial_scales[-1] / 2
            ]
# build top-down blocks
self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
self.top_down_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1, 0, -1):
self.top_down_blocks.append(
                nn.Sequential(*[
DepthwiseSeparable(
num_channels=in_c,
num_filters=out_c,
dw_size=k,
stride=s,
use_se=se)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[
"block1"])
]))
# build bottom-up blocks
self.downsamples = nn.LayerList()
self.bottom_up_blocks = nn.LayerList()
for idx in range(len(in_channels) - 1):
self.downsamples.append(
conv_func(
in_channels[idx],
in_channels[idx],
kernel_size=kernel_size,
stride=2,
act=act))
self.bottom_up_blocks.append(
                nn.Sequential(*[
DepthwiseSeparable(
num_channels=in_c,
num_filters=out_c,
dw_size=k,
stride=s,
use_se=se)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[
"block2"])
]))
def forward(self, inputs):
"""
Args:
inputs (tuple[Tensor]): input features.
Returns:
tuple[Tensor]: CSPPAN features.
"""
assert len(inputs) == len(self.in_channels)
inputs = self.conv_t(inputs)
# top-down path
inner_outs = [inputs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = inputs[idx - 1]
            upsample_feat = self.upsample(feat_high)
inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx](
paddle.concat([upsample_feat, feat_low], 1))
inner_outs.insert(0, inner_out)
# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsamples[idx](feat_low)
            out = self.bottom_up_blocks[idx](paddle.concat(
                [downsample_feat, feat_high], 1))
outs.append(out)
top_features = None
if self.num_features == 4:
top_features = self.first_top_conv(inputs[-1])
top_features = top_features + self.second_top_conv(outs[-1])
outs.append(top_features)
return tuple(outs)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self.out_channels, stride=1. / s)
for s in self.spatial_scales
]
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
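
# Usage sketch (illustrative addition, not part of the original commit):
# runs LCPAN over three hypothetical feature levels at strides 8/16/32;
# channel and spatial sizes are assumptions for demonstration only.
if __name__ == '__main__':
    pan = LCPAN(in_channels=[96, 192, 384], out_channels=96)
    feats = [
        paddle.rand([1, c, 40 // (2**i), 40 // (2**i)])
        for i, c in enumerate([96, 192, 384])
    ]
    outs = pan(feats)  # three maps, each with out_channels channels
    print([o.shape for o in outs])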

View File

@@ -0,0 +1,242 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform
from ppdet.core.workspace import register, serializable
from paddle.regularizer import L2Decay
from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv
import math
from ppdet.modeling.ops import batch_norm
from ..shape_spec import ShapeSpec
__all__ = ['TTFFPN']
class Upsample(nn.Layer):
def __init__(self, ch_in, ch_out, norm_type='bn'):
super(Upsample, self).__init__()
fan_in = ch_in * 3 * 3
stdv = 1. / math.sqrt(fan_in)
self.dcn = DeformableConvV2(
ch_in,
ch_out,
kernel_size=3,
weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=ParamAttr(
initializer=Constant(0),
regularizer=L2Decay(0.),
learning_rate=2.),
lr_scale=2.,
regularizer=L2Decay(0.))
self.bn = batch_norm(
ch_out, norm_type=norm_type, initializer=Constant(1.))
def forward(self, feat):
dcn = self.dcn(feat)
bn = self.bn(dcn)
relu = F.relu(bn)
out = F.interpolate(relu, scale_factor=2., mode='bilinear')
return out
class DeConv(nn.Layer):
def __init__(self, ch_in, ch_out, norm_type='bn'):
super(DeConv, self).__init__()
self.deconv = nn.Sequential()
conv1 = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
stride=1,
filter_size=1,
norm_type=norm_type,
initializer=XavierUniform())
conv2 = nn.Conv2DTranspose(
in_channels=ch_out,
out_channels=ch_out,
kernel_size=4,
padding=1,
stride=2,
groups=ch_out,
weight_attr=ParamAttr(initializer=XavierUniform()),
bias_attr=False)
bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.)
conv3 = ConvNormLayer(
ch_in=ch_out,
ch_out=ch_out,
stride=1,
filter_size=1,
norm_type=norm_type,
initializer=XavierUniform())
self.deconv.add_sublayer('conv1', conv1)
self.deconv.add_sublayer('relu6_1', nn.ReLU6())
self.deconv.add_sublayer('conv2', conv2)
self.deconv.add_sublayer('bn', bn)
self.deconv.add_sublayer('relu6_2', nn.ReLU6())
self.deconv.add_sublayer('conv3', conv3)
self.deconv.add_sublayer('relu6_3', nn.ReLU6())
def forward(self, inputs):
return self.deconv(inputs)
class LiteUpsample(nn.Layer):
def __init__(self, ch_in, ch_out, norm_type='bn'):
super(LiteUpsample, self).__init__()
self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type)
self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type)
def forward(self, inputs):
deconv_up = self.deconv(inputs)
conv = self.conv(inputs)
interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear')
return deconv_up + interp_up
class ShortCut(nn.Layer):
def __init__(self,
layer_num,
ch_in,
ch_out,
norm_type='bn',
lite_neck=False,
name=None):
super(ShortCut, self).__init__()
shortcut_conv = nn.Sequential()
for i in range(layer_num):
fan_out = 3 * 3 * ch_out
std = math.sqrt(2. / fan_out)
in_channels = ch_in if i == 0 else ch_out
shortcut_name = name + '.conv.{}'.format(i)
if lite_neck:
shortcut_conv.add_sublayer(
shortcut_name,
LiteConv(
in_channels=in_channels,
out_channels=ch_out,
with_act=i < layer_num - 1,
norm_type=norm_type))
else:
shortcut_conv.add_sublayer(
shortcut_name,
nn.Conv2D(
in_channels=in_channels,
out_channels=ch_out,
kernel_size=3,
padding=1,
weight_attr=ParamAttr(initializer=Normal(0, std)),
bias_attr=ParamAttr(
learning_rate=2., regularizer=L2Decay(0.))))
if i < layer_num - 1:
shortcut_conv.add_sublayer(shortcut_name + '.act',
nn.ReLU())
self.shortcut = self.add_sublayer('shortcut', shortcut_conv)
def forward(self, feat):
out = self.shortcut(feat)
return out
@register
@serializable
class TTFFPN(nn.Layer):
"""
Args:
        in_channels (list): number of input feature channels from backbone.
            [128, 256, 512, 1024] by default, i.e. the channels of a
            DarkNet53 backbone with return_idx [1, 2, 3, 4].
        planes (list): the number of output feature channels of FPN.
            [256, 128, 64] by default
        shortcut_num (list): the number of convolution layers in each
            shortcut. [3, 2, 1] by default, meaning the shortcut of
            return_idx_1 has 3 convs, return_idx_2 has 2 and return_idx_3
            has 1.
        norm_type (string): norm type, 'sync_bn', 'bn' and 'gn' are optional.
            bn by default
        lite_neck (bool): whether to use lite convs in the TTFNet FPN.
            False by default
        fusion_method (string): the method used to fuse the upsampled and
            lateral features. 'add' and 'concat' are optional, add by default
"""
__shared__ = ['norm_type']
def __init__(self,
in_channels,
planes=[256, 128, 64],
shortcut_num=[3, 2, 1],
norm_type='bn',
lite_neck=False,
fusion_method='add'):
super(TTFFPN, self).__init__()
self.planes = planes
self.shortcut_num = shortcut_num[::-1]
self.shortcut_len = len(shortcut_num)
self.ch_in = in_channels[::-1]
self.fusion_method = fusion_method
self.upsample_list = []
self.shortcut_list = []
self.upper_list = []
for i, out_c in enumerate(self.planes):
in_c = self.ch_in[i] if i == 0 else self.upper_list[-1]
upsample_module = LiteUpsample if lite_neck else Upsample
upsample = self.add_sublayer(
'upsample.' + str(i),
upsample_module(
in_c, out_c, norm_type=norm_type))
self.upsample_list.append(upsample)
if i < self.shortcut_len:
shortcut = self.add_sublayer(
'shortcut.' + str(i),
ShortCut(
self.shortcut_num[i],
self.ch_in[i + 1],
out_c,
norm_type=norm_type,
lite_neck=lite_neck,
name='shortcut.' + str(i)))
self.shortcut_list.append(shortcut)
if self.fusion_method == 'add':
upper_c = out_c
elif self.fusion_method == 'concat':
upper_c = out_c * 2
else:
                raise ValueError(
                    'Illegal fusion method. Expected add or concat, '
                    'but received {}'.format(self.fusion_method))
self.upper_list.append(upper_c)
def forward(self, inputs):
feat = inputs[-1]
for i, out_c in enumerate(self.planes):
feat = self.upsample_list[i](feat)
if i < self.shortcut_len:
shortcut = self.shortcut_list[i](inputs[-i - 2])
if self.fusion_method == 'add':
feat = feat + shortcut
else:
feat = paddle.concat([feat, shortcut], axis=1)
return feat
@classmethod
def from_config(cls, cfg, input_shape):
return {'in_channels': [i.channels for i in input_shape], }
@property
def out_shape(self):
return [ShapeSpec(channels=self.upper_list[-1], )]
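
# Usage sketch (illustrative addition, not part of the original commit):
# fuses four hypothetical DarkNet53 levels into one high-resolution map;
# channel and spatial sizes are assumptions for demonstration only.
if __name__ == '__main__':
    fpn = TTFFPN(in_channels=[128, 256, 512, 1024])
    feats = [
        paddle.rand([1, c, 64 // (2**i), 64 // (2**i)])
        for i, c in enumerate([128, 256, 512, 1024])
    ]
    out = fpn(feats)  # a single map at the finest input scale
    print(out.shape)  # channels == planes[-1] == 64 with fusion_method='add'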

File diff suppressed because it is too large