first commit

2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions
--- a/rtdetr_paddle/ppdet/modeling/backbones/init.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/init.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+from .resnet import *
+from .darknet import *
+from .mobilenet_v1 import *
+from .mobilenet_v3 import *
+from .shufflenet_v2 import *
+from .swin_transformer import *
+from .lcnet import *
+from .cspresnet import *
+from .csp_darknet import *
+from .convnext import *
+from .vision_transformer import *
+from .mobileone import *
+from .trans_encoder import *
+from .focalnet import *
+from .vit_mae import *
+from .hgnet_v2 import *
--- a/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/convnext.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+'''
+Modified from https://github.com/facebookresearch/ConvNeXt
+Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+This source code is licensed under the license found in the
+LICENSE file in the root directory of this source tree.
+'''
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+
+import numpy as np
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+from .transformer_utils import DropPath, trunc_normal_, zeros_
+
+__all__ = ['ConvNeXt']
+
+
+class Block(nn.Layer):
+    r"""Residual ConvNeXt block.
+
+    The residual branch runs: 7x7 depthwise conv -> transpose to (N, H, W, C)
+    -> LayerNorm (channels_last) -> Linear (dim -> 4 * dim) -> GELU
+    -> Linear (4 * dim -> dim) -> optional per-channel layer scale
+    -> transpose back to (N, C, H, W) -> DropPath. Its output is added to the
+    block input.
+
+    Args:
+        dim (int): Number of input channels.
+        drop_path (float): Stochastic depth rate. Default: 0.0
+        layer_scale_init_value (float): Init value for Layer Scale; a value
+            that is not positive disables the scale. Default: 1e-6.
+    """
+
+    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
+        super().__init__()
+        hidden_dim = 4 * dim
+        # depthwise conv
+        self.dwconv = nn.Conv2D(dim, dim, kernel_size=7, padding=3, groups=dim)
+        self.norm = LayerNorm(dim, eps=1e-6)
+        # pointwise/1x1 convs, implemented with linear layers
+        self.pwconv1 = nn.Linear(dim, hidden_dim)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Linear(hidden_dim, dim)
+
+        # Learnable per-channel scale of the residual branch (or None).
+        self.gamma = self.create_parameter(
+            shape=(dim, ),
+            attr=ParamAttr(initializer=Constant(layer_scale_init_value))
+        ) if layer_scale_init_value > 0 else None
+
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = nn.Identity()
+
+    def forward(self, x):
+        shortcut = x
+        # NCHW -> NHWC so the norm and linear layers act on the channel axis.
+        branch = self.dwconv(x).transpose([0, 2, 3, 1])
+        branch = self.pwconv2(self.act(self.pwconv1(self.norm(branch))))
+        if self.gamma is not None:
+            branch = self.gamma * branch
+        # NHWC -> NCHW before joining the shortcut.
+        branch = branch.transpose([0, 3, 1, 2])
+        return shortcut + self.drop_path(branch)
+
+
+class LayerNorm(nn.Layer):
+    r"""LayerNorm over the channel axis for two data layouts.
+
+    ``channels_last`` (default) expects the channels on the last axis, e.g.
+    (batch_size, height, width, channels), and delegates to ``F.layer_norm``.
+    ``channels_first`` expects (batch_size, channels, height, width) and
+    normalizes over axis 1 by hand.
+
+    Args:
+        normalized_shape (int): Number of channels being normalized.
+        eps (float): Value added to the variance for stability. Default: 1e-6
+        data_format (str): "channels_last" or "channels_first".
+
+    Raises:
+        NotImplementedError: If ``data_format`` is neither of the two layouts.
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
+        super().__init__()
+
+        param_shape = (normalized_shape, )
+        self.weight = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
+        self.bias = self.create_parameter(
+            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))
+
+        self.eps = eps
+        self.data_format = data_format
+        if self.data_format not in ("channels_last", "channels_first"):
+            raise NotImplementedError
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        if self.data_format == "channels_last":
+            return F.layer_norm(x, self.normalized_shape, self.weight,
+                                self.bias, self.eps)
+        if self.data_format == "channels_first":
+            # Biased mean/variance over the channel axis (axis 1).
+            mean = x.mean(1, keepdim=True)
+            var = (x - mean).pow(2).mean(1, keepdim=True)
+            normed = (x - mean) / paddle.sqrt(var + self.eps)
+            # Broadcast the per-channel affine parameters over H and W.
+            scale = self.weight[:, None, None]
+            shift = self.bias[:, None, None]
+            return scale * normed + shift
+
+
+@register
+@serializable
+class ConvNeXt(nn.Layer):
+    r""" ConvNeXt
+        A PaddlePaddle impl of : `A ConvNet for the 2020s`  -
+          https://arxiv.org/pdf/2201.03545.pdf
+
+    Args:
+        arch (str): Key into ``arch_settings`` ('tiny', 'small', 'base',
+            'large' or 'xlarge') selecting the per-stage depths and dims.
+            Default: 'tiny'
+        in_chans (int): Number of input image channels. Default: 3
+        drop_path_rate (float): Stochastic depth rate. Default: 0.
+        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
+        return_idx (list(int)): Indices of the stages whose feature maps are
+            returned. Default: [1, 2, 3]
+        norm_output (bool): Whether each returned feature map goes through its
+            own channels_first LayerNorm. Default: True
+        pretrained (str|None): Weights to load after initialization. A string
+            starting with ``http://`` or ``https://`` is downloaded first; any
+            other string is used as a local path. Default: None
+    """
+
+    arch_settings = {
+        'tiny': {
+            'depths': [3, 3, 9, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'small': {
+            'depths': [3, 3, 27, 3],
+            'dims': [96, 192, 384, 768]
+        },
+        'base': {
+            'depths': [3, 3, 27, 3],
+            'dims': [128, 256, 512, 1024]
+        },
+        'large': {
+            'depths': [3, 3, 27, 3],
+            'dims': [192, 384, 768, 1536]
+        },
+        'xlarge': {
+            'depths': [3, 3, 27, 3],
+            'dims': [256, 512, 1024, 2048]
+        },
+    }
+
+    def __init__(
+            self,
+            arch='tiny',
+            in_chans=3,
+            drop_path_rate=0.,
+            layer_scale_init_value=1e-6,
+            return_idx=[1, 2, 3],
+            norm_output=True,
+            pretrained=None, ):
+        super().__init__()
+        depths = self.arch_settings[arch]['depths']
+        dims = self.arch_settings[arch]['dims']
+        self.downsample_layers = nn.LayerList(
+        )  # stem and 3 intermediate downsampling conv layers
+        stem = nn.Sequential(
+            nn.Conv2D(
+                in_chans, dims[0], kernel_size=4, stride=4),
+            LayerNorm(
+                dims[0], eps=1e-6, data_format="channels_first"))
+        self.downsample_layers.append(stem)
+        for i in range(3):
+            downsample_layer = nn.Sequential(
+                LayerNorm(
+                    dims[i], eps=1e-6, data_format="channels_first"),
+                nn.Conv2D(
+                    dims[i], dims[i + 1], kernel_size=2, stride=2), )
+            self.downsample_layers.append(downsample_layer)
+
+        self.stages = nn.LayerList(
+        )  # 4 feature resolution stages, each consisting of multiple residual blocks
+        # One drop-path rate per block, rising linearly from 0 to drop_path_rate.
+        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
+        cur = 0
+        for i in range(4):
+            stage = nn.Sequential(* [
+                Block(
+                    dim=dims[i],
+                    drop_path=dp_rates[cur + j],
+                    layer_scale_init_value=layer_scale_init_value)
+                for j in range(depths[i])
+            ])
+            self.stages.append(stage)
+            cur += depths[i]
+
+        self.return_idx = return_idx
+        self.dims = [dims[i] for i in return_idx]  # [::-1]
+
+        self.norm_output = norm_output
+        if norm_output:
+            # One output norm per returned stage, in return_idx order.
+            self.norms = nn.LayerList([
+                LayerNorm(
+                    c, eps=1e-6, data_format="channels_first")
+                for c in self.dims
+            ])
+
+        self.apply(self._init_weights)
+
+        if pretrained is not None:
+            # Test the scheme prefix rather than the substring 'http', so a
+            # local path that merely contains "http" is not sent to the
+            # downloader.
+            if pretrained.startswith(('http://', 'https://')):  # URL
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  # model in local path
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _init_weights(self, m):
+        """Truncated-normal weights and zero bias for Conv2D / Linear layers."""
+        if isinstance(m, (nn.Conv2D, nn.Linear)):
+            trunc_normal_(m.weight)
+            zeros_(m.bias)
+
+    def forward_features(self, x):
+        """Run all four stages and return the feature maps picked by return_idx."""
+        output = []
+        for downsample, stage in zip(self.downsample_layers, self.stages):
+            x = stage(downsample(x))
+            output.append(x)
+
+        outputs = [output[i] for i in self.return_idx]
+        if self.norm_output:
+            outputs = [norm(out) for norm, out in zip(self.norms, outputs)]
+
+        return outputs
+
+    def forward(self, x):
+        """Take the input dict, read its 'image' entry and return the features."""
+        x = self.forward_features(x['image'])
+        return x
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels only) for each returned feature map."""
+        return [ShapeSpec(channels=c) for c in self.dims]
--- a/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
@@ -0,0 +1,404 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+# 
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.initializer import conv_init_
+from ..shape_spec import ShapeSpec
+
+__all__ = [
+    'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
+]
+
+
+class BaseConv(nn.Layer):
+    """Conv2D -> BatchNorm2D -> ``x * sigmoid(x)`` building block.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        ksize (int): Kernel size; the padding used is ``(ksize - 1) // 2``.
+        stride (int): Convolution stride.
+        groups (int): Number of conv groups, default 1.
+        bias (bool): Forwarded as ``bias_attr`` of the conv, default False.
+        act (str): Accepted by the signature but not read in this class;
+            ``forward`` always applies ``x * sigmoid(x)``.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride,
+                 groups=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        same_padding = (ksize - 1) // 2
+        self.conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=ksize,
+            stride=stride,
+            padding=same_padding,
+            groups=groups,
+            bias_attr=bias)
+        # Both BatchNorm parameters get an L2Decay(0.0) regularizer.
+        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        self.bn = nn.BatchNorm2D(
+            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        self._init_weights()
+
+    def _init_weights(self):
+        conv_init_(self.conv)
+
+    def forward(self, x):
+        # 'x * F.sigmoid(x)' is used in place of a 'silu' op
+        normed = self.bn(self.conv(x))
+        return normed * F.sigmoid(normed)
+
+
+class DWConv(nn.Layer):
+    """Depthwise conv followed by a 1x1 pointwise conv, both ``BaseConv``.
+
+    Args:
+        in_channels (int): Number of input channels (also the group count of
+            the depthwise conv).
+        out_channels (int): Number of output channels of the pointwise conv.
+        ksize (int): Kernel size of the depthwise conv.
+        stride (int): Stride of the depthwise conv, default 1.
+        bias (bool): Forwarded to both ``BaseConv`` layers, default False.
+        act (str): Forwarded to both ``BaseConv`` layers, default 'silu'.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize,
+                 stride=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        # Settings shared by the two convs.
+        shared = dict(bias=bias, act=act)
+        self.dw_conv = BaseConv(
+            in_channels,
+            in_channels,
+            ksize=ksize,
+            stride=stride,
+            groups=in_channels,
+            **shared)
+        self.pw_conv = BaseConv(
+            in_channels, out_channels, ksize=1, stride=1, groups=1, **shared)
+
+    def forward(self, x):
+        depthwise_out = self.dw_conv(x)
+        return self.pw_conv(depthwise_out)
+
+
+class Focus(nn.Layer):
+    """Focus width and height information into channel space, used in YOLOX.
+
+    The input is sampled at every second row and column with the four
+    possible offsets, the four slices are concatenated along the channel
+    axis, and a ``BaseConv`` is applied to the result.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=3,
+                 stride=1,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        # The four interleaved slices are stacked on the channel axis.
+        stacked_channels = in_channels * 4
+        self.conv = BaseConv(
+            stacked_channels,
+            out_channels,
+            ksize=ksize,
+            stride=stride,
+            bias=bias,
+            act=act)
+
+    def forward(self, inputs):
+        # inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2]
+        # Slice order: top-left, bottom-left, top-right, bottom-right.
+        patches = [
+            inputs[:, :, row::2, col::2] for col in (0, 1) for row in (0, 1)
+        ]
+        stacked = paddle.concat(patches, 1)
+        return self.conv(stacked)
+
+
+class BottleNeck(nn.Layer):
+    """1x1 ``BaseConv`` followed by a 3x3 conv, with an optional identity add.
+
+    Args:
+        in_channels (int): Number of input channels.
+        out_channels (int): Number of output channels.
+        shortcut (bool): Request the residual add; it is only applied when
+            ``in_channels == out_channels``.
+        expansion (float): Hidden width is ``int(out_channels * expansion)``.
+        depthwise (bool): Use ``DWConv`` instead of ``BaseConv`` for the 3x3 conv.
+        bias (bool): Forwarded to both convs.
+        act (str): Forwarded to both convs.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 shortcut=True,
+                 expansion=0.5,
+                 depthwise=False,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        if depthwise:
+            spatial_conv_cls = DWConv
+        else:
+            spatial_conv_cls = BaseConv
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = spatial_conv_cls(
+            hidden_channels, out_channels, ksize=3, stride=1, bias=bias, act=act)
+        self.add_shortcut = shortcut and in_channels == out_channels
+
+    def forward(self, x):
+        out = self.conv2(self.conv1(x))
+        if not self.add_shortcut:
+            return out
+        return out + x
+
+
+class SPPLayer(nn.Layer):
+    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.
+
+    A 1x1 conv halves the channels, the result is max-pooled once per entry
+    of ``kernel_sizes`` (stride 1, padding ``ks // 2``), and the unpooled and
+    pooled maps are concatenated on the channel axis and fused by a second
+    1x1 conv.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_sizes=(5, 9, 13),
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        pools = []
+        for ks in kernel_sizes:
+            pools.append(
+                nn.MaxPool2D(
+                    kernel_size=ks, stride=1, padding=ks // 2))
+        self.maxpoolings = nn.LayerList(pools)
+        # One branch per pooling kernel plus the unpooled branch.
+        conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        reduced = self.conv1(x)
+        branches = [reduced]
+        for pool in self.maxpoolings:
+            branches.append(pool(reduced))
+        return self.conv2(paddle.concat(branches, axis=1))
+
+
+class SPPFLayer(nn.Layer):
+    """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
+        equivalent to SPP(k=(5, 9, 13))
+
+    A single max-pool (stride 1, padding ``ksize // 2``) is applied three
+    times in sequence; the input of the chain and its three outputs are
+    concatenated on the channel axis and fused by a 1x1 conv.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 ksize=5,
+                 bias=False,
+                 act='silu'):
+        super().__init__()
+        hidden_channels = in_channels // 2
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.maxpooling = nn.MaxPool2D(
+            kernel_size=ksize, stride=1, padding=ksize // 2)
+        # Unpooled map plus three successively pooled maps.
+        conv2_channels = hidden_channels * 4
+        self.conv2 = BaseConv(
+            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
+
+    def forward(self, x):
+        feats = [self.conv1(x)]
+        for _ in range(3):
+            # Each pooling pass consumes the previous pass's output.
+            feats.append(self.maxpooling(feats[-1]))
+        return self.conv2(paddle.concat(feats, axis=1))
+
+
+class CSPLayer(nn.Layer):
+    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.
+
+    The input feeds two 1x1 convs. The first branch continues through
+    ``num_blocks`` ``BottleNeck`` blocks; both branches are then concatenated
+    on the channel axis and fused by a third 1x1 conv.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=1,
+                 shortcut=True,
+                 expansion=0.5,
+                 depthwise=False,
+                 bias=False,
+                 act="silu"):
+        super().__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        blocks = []
+        for _ in range(num_blocks):
+            blocks.append(
+                BottleNeck(
+                    hidden_channels,
+                    hidden_channels,
+                    shortcut=shortcut,
+                    expansion=1.0,
+                    depthwise=depthwise,
+                    bias=bias,
+                    act=act))
+        self.bottlenecks = nn.Sequential(*blocks)
+        self.conv3 = BaseConv(
+            hidden_channels * 2,
+            out_channels,
+            ksize=1,
+            stride=1,
+            bias=bias,
+            act=act)
+
+    def forward(self, x):
+        deep_branch = self.bottlenecks(self.conv1(x))
+        skip_branch = self.conv2(x)
+        merged = paddle.concat([deep_branch, skip_branch], axis=1)
+        return self.conv3(merged)
+
+
+@register
+@serializable
+class CSPDarkNet(nn.Layer):
+    """
+    CSPDarkNet backbone.
+
+    A stem followed by one downsampling stage per row of
+    ``arch_settings[arch]``. Each stage is a stride-2 conv, a CSPLayer, and
+    (on the row flagged ``use_spp``) an SPP layer before the CSPLayer for
+    'X' or an SPPF layer after it for 'P5'/'P6'.
+
+    Args:
+        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
+            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
+        depth_mult (float): Depth multiplier, multiply number of blocks in
+            each CSPLayer (rounded, at least 1), default as 1.0.
+        width_mult (float): Width multiplier, multiply number of channels in
+            each layer, default as 1.0.
+        depthwise (bool): Whether to use depth-wise conv layer.
+        act (str): Activation function type, default as 'silu'.
+        trt (bool): Accepted by the constructor but not read in this class.
+        return_idx (list): Index of stages whose feature maps are returned.
+            Index 0 refers to the stem in ``out_shape``; ``forward`` only
+            ever emits stage outputs (indices >= 1).
+    """
+
+    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
+
+    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
+    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
+    arch_settings = {
+        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
+              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
+        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
+        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
+               [256, 512, 9, True, False], [512, 768, 3, True, False],
+               [768, 1024, 3, True, True]],
+    }
+
+    def __init__(self,
+                 arch='X',
+                 depth_mult=1.0,
+                 width_mult=1.0,
+                 depthwise=False,
+                 act='silu',
+                 trt=False,
+                 return_idx=[2, 3, 4]):
+        super(CSPDarkNet, self).__init__()
+        self.arch = arch
+        self.return_idx = return_idx
+        Conv = DWConv if depthwise else BaseConv
+        # An arch that is not a key of arch_settings raises KeyError here.
+        arch_setting = self.arch_settings[arch]
+        base_channels = int(arch_setting[0][0] * width_mult)
+
+        # Note: differences between the latest YOLOv5 and the original YOLOX
+        # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 2. use SPPF(in YOLOv5) or SPP(in YOLOX)
+        # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer
+        # 4. whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX
+        if arch in ['P5', 'P6']:
+            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only a single spp kernel size)
+            self.stem = Conv(
+                3, base_channels, ksize=6, stride=2, bias=False, act=act)
+            # Not read on this path: the SPPFLayer below is built with a literal ksize=5.
+            spp_kernal_sizes = 5
+        elif arch in ['X']:
+            # in the original YOLOX, use Focus stem, and SPP (three spp kernel sizes)
+            self.stem = Focus(
+                3, base_channels, ksize=3, stride=1, bias=False, act=act)
+            spp_kernal_sizes = (5, 9, 13)
+        else:
+            # NOTE(review): an arch missing from arch_settings already failed
+            # with KeyError at the lookup above, so this branch is only
+            # reachable for a key that exists in arch_settings but is handled
+            # by neither branch.
+            raise AttributeError("Unsupported arch type: {}".format(arch))
+
+        # Channel count after the stem, then after every stage.
+        _out_channels = [base_channels]
+        # Running counter embedded in sublayer names; it fixes the parameter
+        # names, so the order of the add_sublayer calls below matters.
+        layers_num = 1
+        # Plain Python list: the layers are registered through add_sublayer.
+        self.csp_dark_blocks = []
+
+        for i, (in_channels, out_channels, num_blocks, shortcut,
+                use_spp) in enumerate(arch_setting):
+            in_channels = int(in_channels * width_mult)
+            out_channels = int(out_channels * width_mult)
+            _out_channels.append(out_channels)
+            num_blocks = max(round(num_blocks * depth_mult), 1)
+            stage = []
+
+            # Stride-2 3x3 conv that downsamples at the start of the stage.
+            conv_layer = self.add_sublayer(
+                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
+                Conv(
+                    in_channels, out_channels, 3, 2, bias=False, act=act))
+            stage.append(conv_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['X']:
+                # in YOLOX use SPPLayer
+                spp_layer = self.add_sublayer(
+                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
+                    SPPLayer(
+                        out_channels,
+                        out_channels,
+                        kernel_sizes=spp_kernal_sizes,
+                        bias=False,
+                        act=act))
+                stage.append(spp_layer)
+                layers_num += 1
+
+            csp_layer = self.add_sublayer(
+                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
+                CSPLayer(
+                    out_channels,
+                    out_channels,
+                    num_blocks=num_blocks,
+                    shortcut=shortcut,
+                    depthwise=depthwise,
+                    bias=False,
+                    act=act))
+            stage.append(csp_layer)
+            layers_num += 1
+
+            if use_spp and arch in ['P5', 'P6']:
+                # in latest YOLOv5 use SPPFLayer instead of SPPLayer
+                sppf_layer = self.add_sublayer(
+                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
+                    SPPFLayer(
+                        out_channels,
+                        out_channels,
+                        ksize=5,
+                        bias=False,
+                        act=act))
+                stage.append(sppf_layer)
+                layers_num += 1
+
+            self.csp_dark_blocks.append(nn.Sequential(*stage))
+
+        # Index 0 is the stem, index k is stage k.
+        self._out_channels = [_out_channels[i] for i in self.return_idx]
+        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
+
+    def forward(self, inputs):
+        # Reads the 'image' entry of the input dict.
+        x = inputs['image']
+        outputs = []
+        x = self.stem(x)
+        for i, layer in enumerate(self.csp_dark_blocks):
+            x = layer(x)
+            # Stage numbering is 1-based; outputs are collected in stage
+            # order regardless of the order of return_idx.
+            if i + 1 in self.return_idx:
+                outputs.append(x)
+        return outputs
+
+    @property
+    def out_shape(self):
+        # One ShapeSpec (channels, stride) per entry of return_idx.
+        return [
+            ShapeSpec(
+                channels=c, stride=s)
+            for c, s in zip(self._out_channels, self.strides)
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
@@ -0,0 +1,321 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D -> BatchNorm2D -> activation.
+
+    Args:
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        filter_size (int): Conv kernel size, default 3.
+        stride (int): Conv stride, default 1.
+        groups (int): Number of conv groups, default 1.
+        padding (int): Conv padding, default 0.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            any other object is stored and called as the activation.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 act=None):
+        super().__init__()
+
+        self.conv = nn.Conv2D(
+            in_channels=ch_in,
+            out_channels=ch_out,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias_attr=False)
+
+        # Both BatchNorm parameters get an L2Decay(0.0) regularizer.
+        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        self.bn = nn.BatchNorm2D(
+            ch_out, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        if act is None or isinstance(act, (str, dict)):
+            self.act = get_act_fn(act)
+        else:
+            self.act = act
+
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+
+
+class RepVggBlock(nn.Layer):
+    """Re-parameterizable block: parallel 3x3 and 1x1 ConvBN branches.
+
+    Before conversion the output is ``act(conv1(x) + alpha * conv2(x))``, or
+    ``act(conv1(x) + conv2(x))`` when no alpha parameter exists.
+    ``convert_to_deploy`` folds both branches (including their BatchNorm
+    statistics) into one 3x3 conv stored as ``self.conv``.
+
+    Args:
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            any other object is stored and called as the activation.
+        alpha (bool): If True, create a learnable scalar (initialized to 1)
+            that scales the 1x1 branch.
+    """
+
+    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
+        super(RepVggBlock, self).__init__()
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=None)
+        self.conv2 = ConvBNLayer(
+            ch_in, ch_out, 1, stride=1, padding=0, act=None)
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+        if alpha:
+            self.alpha = self.create_parameter(
+                shape=[1],
+                attr=ParamAttr(initializer=Constant(value=1.)),
+                dtype="float32")
+        else:
+            self.alpha = None
+
+    def forward(self, x):
+        if hasattr(self, 'conv'):
+            # Deploy mode: the two branches were fused into a single conv.
+            y = self.conv(x)
+        else:
+            # Test for presence, not truthiness: alpha is a learnable tensor,
+            # and a learned value of exactly 0 must still scale the 1x1
+            # branch instead of falling back to the unscaled sum.
+            if self.alpha is not None:
+                y = self.conv1(x) + self.alpha * self.conv2(x)
+            else:
+                y = self.conv1(x) + self.conv2(x)
+        y = self.act(y)
+        return y
+
+    def convert_to_deploy(self):
+        """Fuse both branches into ``self.conv`` and drop ``conv1``/``conv2``.
+
+        Calling it again after the branches were removed is a no-op.
+        """
+        if not hasattr(self, 'conv1'):
+            # Already converted: the source branches no longer exist.
+            return
+        if not hasattr(self, 'conv'):
+            self.conv = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                groups=1)
+        kernel, bias = self.get_equivalent_kernel_bias()
+        self.conv.weight.set_value(kernel)
+        self.conv.bias.set_value(bias)
+        self.__delattr__('conv1')
+        self.__delattr__('conv2')
+
+    def get_equivalent_kernel_bias(self):
+        """Return the (kernel, bias) of the single 3x3 conv equal to both branches."""
+        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
+        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
+        padded1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
+        # Same presence test as in forward, so the fused conv matches it.
+        if self.alpha is not None:
+            return (kernel3x3 + self.alpha * padded1x1,
+                    bias3x3 + self.alpha * bias1x1)
+        return kernel3x3 + padded1x1, bias3x3 + bias1x1
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Zero-pad a 1x1 kernel by one on each spatial side; None maps to 0."""
+        if kernel1x1 is None:
+            return 0
+        else:
+            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
+
+    def _fuse_bn_tensor(self, branch):
+        """Fold a ConvBN branch's BatchNorm into an equivalent (kernel, bias)."""
+        if branch is None:
+            return 0, 0
+        kernel = branch.conv.weight
+        running_mean = branch.bn._mean
+        running_var = branch.bn._variance
+        gamma = branch.bn.weight
+        beta = branch.bn.bias
+        eps = branch.bn._epsilon
+        std = (running_var + eps).sqrt()
+        # Per-output-channel scale, reshaped to broadcast over the kernel.
+        t = (gamma / std).reshape((-1, 1, 1, 1))
+        return kernel * t, beta - running_mean * gamma / std
+
+
+class BasicBlock(nn.Layer):
+    """3x3 ``ConvBNLayer`` followed by a ``RepVggBlock``, with optional residual.
+
+    Args:
+        ch_in (int): Number of input channels; must equal ``ch_out``.
+        ch_out (int): Number of output channels.
+        act: Activation forwarded to both sublayers.
+        shortcut (bool): If True, the block input is added to the output.
+        use_alpha (bool): Forwarded to ``RepVggBlock`` as ``alpha``.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 act='relu',
+                 shortcut=True,
+                 use_alpha=False):
+        super().__init__()
+        assert ch_in == ch_out
+        self.conv1 = ConvBNLayer(
+            ch_in, ch_out, 3, stride=1, padding=1, act=act)
+        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
+        self.shortcut = shortcut
+
+    def forward(self, x):
+        out = self.conv2(self.conv1(x))
+        if not self.shortcut:
+            return out
+        return paddle.add(x, out)
+
+
+class EffectiveSELayer(nn.Layer):
+    """ Effective Squeeze-Excitation
+    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
+
+    The input is averaged over axes 2 and 3, passed through a 1x1 conv and
+    the activation, and the result multiplies the input.
+    """
+
+    def __init__(self, channels, act='hardsigmoid'):
+        super().__init__()
+        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
+        if act is None or isinstance(act, (str, dict)):
+            self.act = get_act_fn(act)
+        else:
+            self.act = act
+
+    def forward(self, x):
+        pooled = x.mean((2, 3), keepdim=True)
+        gate = self.act(self.fc(pooled))
+        return x * gate
+
+
+class CSPResStage(nn.Layer):
+    """One CSP stage: optional stride-2 conv, two 1x1 branches, blocks, fuse.
+
+    Args:
+        block_fn: Callable building one block; it is called as
+            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)``.
+        ch_in (int): Number of input channels.
+        ch_out (int): Number of output channels.
+        n (int): Number of blocks in the second branch.
+        stride (int): A value of 2 adds the downsampling conv; any other
+            value skips it.
+        act: Activation forwarded to the convs and blocks.
+        attn: Any truthy value enables an ``EffectiveSELayer``; the value
+            itself is not inspected.
+        use_alpha (bool): Forwarded to ``block_fn``.
+    """
+
+    def __init__(self,
+                 block_fn,
+                 ch_in,
+                 ch_out,
+                 n,
+                 stride,
+                 act='relu',
+                 attn='eca',
+                 use_alpha=False):
+        super(CSPResStage, self).__init__()
+
+        ch_mid = (ch_in + ch_out) // 2
+        if stride == 2:
+            self.conv_down = ConvBNLayer(
+                ch_in, ch_mid, 3, stride=2, padding=1, act=act)
+        else:
+            # NOTE(review): without conv_down the input goes straight into
+            # conv1/conv2, which are built for ch_mid input channels.
+            self.conv_down = None
+        # Two parallel 1x1 convs, each producing ch_mid // 2 channels.
+        self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
+        self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
+        self.blocks = nn.Sequential(*[
+            block_fn(
+                ch_mid // 2,
+                ch_mid // 2,
+                act=act,
+                shortcut=True,
+                use_alpha=use_alpha) for i in range(n)
+        ])
+        if attn:
+            self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
+        else:
+            self.attn = None
+
+        # 1x1 conv mapping the concatenated branches to ch_out channels.
+        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
+
+    def forward(self, x):
+        if self.conv_down is not None:
+            x = self.conv_down(x)
+        # y1 is the plain branch; y2 additionally runs through the blocks.
+        y1 = self.conv1(x)
+        y2 = self.blocks(self.conv2(x))
+        y = paddle.concat([y1, y2], axis=1)
+        if self.attn is not None:
+            y = self.attn(y)
+        y = self.conv3(y)
+        return y
+
+
+@register
+@serializable
+class CSPResNet(nn.Layer):
+    """CSPResNet backbone: a conv stem followed by stride-2 ``CSPResStage``s.
+
+    Args:
+        layers (list): Number of blocks per stage, scaled by ``depth_mult``.
+        channels (list): Stem output channels followed by the output channels
+            of every stage, scaled by ``width_mult``.
+        act: ``None``, a str or a dict is resolved through ``get_act_fn``;
+            any other object is used as the activation directly.
+        return_idx (list): Indices of the stages whose outputs are returned.
+        depth_wise (bool): Accepted but not read in this class.
+        use_large_stem (bool): Use a three-conv stem instead of a two-conv one.
+        width_mult (float): Channel multiplier (rounded, at least 1).
+        depth_mult (float): Block-count multiplier (rounded, at least 1).
+        trt (bool): Forwarded to ``get_act_fn``.
+        use_checkpoint (bool): During training, run every stage through
+            ``paddle.distributed.fleet.utils.recompute``.
+        use_alpha (bool): Forwarded to every ``CSPResStage``.
+        **args: Extra keyword arguments are accepted and ignored.
+    """
+
+    __shared__ = ['width_mult', 'depth_mult', 'trt']
+
+    def __init__(self,
+                 layers=[3, 6, 6, 3],
+                 channels=[64, 128, 256, 512, 1024],
+                 act='swish',
+                 return_idx=[1, 2, 3],
+                 depth_wise=False,
+                 use_large_stem=False,
+                 width_mult=1.0,
+                 depth_mult=1.0,
+                 trt=False,
+                 use_checkpoint=False,
+                 use_alpha=False,
+                 **args):
+        super(CSPResNet, self).__init__()
+        self.use_checkpoint = use_checkpoint
+        # The scaled lists are new objects; the default lists are not mutated.
+        channels = [max(round(c * width_mult), 1) for c in channels]
+        layers = [max(round(l * depth_mult), 1) for l in layers]
+        act = get_act_fn(
+            act, trt=trt) if act is None or isinstance(act,
+                                                       (str, dict)) else act
+
+        # In both stem variants only the first conv has stride 2.
+        if use_large_stem:
+            self.stem = nn.Sequential(
+                ('conv1', ConvBNLayer(
+                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
+                ('conv2', ConvBNLayer(
+                    channels[0] // 2,
+                    channels[0] // 2,
+                    3,
+                    stride=1,
+                    padding=1,
+                    act=act)), ('conv3', ConvBNLayer(
+                        channels[0] // 2,
+                        channels[0],
+                        3,
+                        stride=1,
+                        padding=1,
+                        act=act)))
+        else:
+            self.stem = nn.Sequential(
+                ('conv1', ConvBNLayer(
+                    3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
+                ('conv2', ConvBNLayer(
+                    channels[0] // 2,
+                    channels[0],
+                    3,
+                    stride=1,
+                    padding=1,
+                    act=act)))
+
+        # One stage per consecutive pair of entries in channels.
+        n = len(channels) - 1
+        # Stages are registered under the names '0', '1', ...; every stage
+        # is built with stride 2.
+        self.stages = nn.Sequential(*[(str(i), CSPResStage(
+            BasicBlock,
+            channels[i],
+            channels[i + 1],
+            layers[i],
+            2,
+            act=act,
+            use_alpha=use_alpha)) for i in range(n)])
+
+        self._out_channels = channels[1:]
+        # The stem halves the resolution once and every stage once more,
+        # so stage i has an output stride of 4 * 2**i.
+        self._out_strides = [4 * 2**i for i in range(n)]
+        self.return_idx = return_idx
+        if use_checkpoint:
+            # NOTE(review): this reseeds Paddle's global RNG as a side effect
+            # of building the model.
+            paddle.seed(0)
+
+    def forward(self, inputs):
+        # Reads the 'image' entry of the input dict.
+        x = inputs['image']
+        x = self.stem(x)
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            if self.use_checkpoint and self.training:
+                x = paddle.distributed.fleet.utils.recompute(
+                    stage, x, **{"preserve_rng_state": True})
+            else:
+                x = stage(x)
+            # Outputs are collected in stage order, whatever the order of
+            # return_idx.
+            if idx in self.return_idx:
+                outs.append(x)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        # One ShapeSpec (channels, stride) per entry of return_idx.
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/darknet.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.ops import batch_norm, mish
+from ..shape_spec import ShapeSpec
+
+__all__ = ['DarkNet', 'ConvBNLayer']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free convolution followed by a norm layer and an activation."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=1,
+                 groups=1,
+                 padding=0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 act="leaky",
+                 freeze_norm=False,
+                 data_format='NCHW',
+                 name=''):
+        """
+        conv + bn + activation layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 1
+            groups (int): number of groups of conv layer, default 1
+            padding (int): padding size, default 0
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm
+                layer, default 0.
+            act (str): activation function type, default 'leaky', which
+                means leaky_relu with slope 0.1; any other value is looked
+                up by name in paddle.nn.functional
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+            name (str): accepted but not used by this layer
+        """
+        super(ConvBNLayer, self).__init__()
+
+        conv_options = dict(
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            data_format=data_format,
+            bias_attr=False)
+        self.conv = nn.Conv2D(ch_in, ch_out, filter_size, **conv_options)
+
+        norm_options = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.batch_norm = batch_norm(ch_out, **norm_options)
+
+        self.act = act
+
+    def forward(self, inputs):
+        """Apply convolution, normalization and the configured activation."""
+        normed = self.batch_norm(self.conv(inputs))
+        if self.act == 'leaky':
+            return F.leaky_relu(normed, 0.1)
+        # Every other activation is resolved by name on paddle.nn.functional.
+        activation = getattr(F, self.act)
+        return activation(normed)
+
+
+class DownSample(nn.Layer):
+    """Single ConvBNLayer used to downsample (stride 2 by default)."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size=3,
+                 stride=2,
+                 padding=1,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        downsample layer
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            filter_size (int): filter size, default 3
+            stride (int): stride, default 2
+            padding (int): padding size, default 1
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm
+                layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(DownSample, self).__init__()
+
+        layer_options = dict(
+            filter_size=filter_size,
+            stride=stride,
+            padding=padding,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+        self.conv_bn_layer = ConvBNLayer(
+            ch_in=ch_in, ch_out=ch_out, **layer_options)
+        # Exposed as a plain attribute alongside the layer.
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        """Return the output of the wrapped ConvBNLayer."""
+        return self.conv_bn_layer(inputs)
+
+
+class BasicBlock(nn.Layer):
+    """Residual unit of DarkNet: 1x1 squeeze conv, 3x3 expand conv, add."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        BasicBlock layer of DarkNet
+
+        Args:
+            ch_in (int): input channel, must equal ch_out and be even
+            ch_out (int): output channel
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm
+                layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+
+        super(BasicBlock, self).__init__()
+
+        assert ch_in == ch_out and (ch_in % 2) == 0, \
+            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
+
+        # Channel route, e.g. for ch_in == ch_out == 10:
+        #   conv1: 10 -> 5 (1x1), conv2: 5 -> 10 (3x3)
+        squeezed = int(ch_out / 2)
+        norm_options = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        self.conv1 = ConvBNLayer(
+            ch_in=ch_in,
+            ch_out=squeezed,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            **norm_options)
+        self.conv2 = ConvBNLayer(
+            ch_in=squeezed,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            **norm_options)
+
+    def forward(self, inputs):
+        """Return ``inputs + conv2(conv1(inputs))``."""
+        branch = self.conv2(self.conv1(inputs))
+        return paddle.add(x=inputs, y=branch)
+
+
+class Blocks(nn.Layer):
+    """A chain of ``count`` BasicBlock layers applied one after another."""
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 count,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None,
+                 data_format='NCHW'):
+        """
+        Blocks layer, which consists of some BasicBlock layers
+
+        Args:
+            ch_in (int): input channel
+            ch_out (int): output channel
+            count (int): number of BasicBlock layer
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm
+                layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            name (str): layer name, used as prefix of the sublayer names
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(Blocks, self).__init__()
+
+        norm_options = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        # The first block is a regular attribute; the remaining count - 1
+        # blocks are registered as sublayers named "<name>.<index>".
+        self.basicblock0 = BasicBlock(ch_in, ch_out, **norm_options)
+        self.res_out_list = []
+        for index in range(1, count):
+            extra_block = BasicBlock(ch_out, ch_out, **norm_options)
+            registered = self.add_sublayer('{}.{}'.format(name, index),
+                                           extra_block)
+            self.res_out_list.append(registered)
+        self.ch_out = ch_out
+
+    def forward(self, inputs):
+        """Feed ``inputs`` through basicblock0 and then every extra block."""
+        feat = self.basicblock0(inputs)
+        for extra_block in self.res_out_list:
+            feat = extra_block(feat)
+        return feat
+
+
+# depth -> number of BasicBlock layers in each stage
+DarkNet_cfg = {
+    53: [1, 2, 8, 8, 4],
+}
+
+
+@register
+@serializable
+class DarkNet(nn.Layer):
+    """DarkNet backbone built from a stem conv, Blocks stages and DownSample layers."""
+
+    __shared__ = ['norm_type', 'data_format']
+
+    def __init__(self,
+                 depth=53,
+                 freeze_at=-1,
+                 return_idx=[2, 3, 4],
+                 num_stages=5,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 data_format='NCHW'):
+        """
+        Darknet, see https://pjreddie.com/darknet/yolo/
+
+        Args:
+            depth (int): depth of network, must be a key of DarkNet_cfg
+            freeze_at (int): index of the stage whose output gets
+                stop_gradient set in forward
+            return_idx (list): index of stages whose feature maps are returned
+            num_stages (int): number of stages taken from DarkNet_cfg[depth]
+            norm_type (str): batch norm type, default bn
+            norm_decay (float): decay for weight and bias of batch norm
+                layer, default 0.
+            freeze_norm (bool): whether to freeze norm, default False
+            data_format (str): data format, NCHW or NHWC
+        """
+        super(DarkNet, self).__init__()
+        self.depth = depth
+        self.freeze_at = freeze_at
+        self.return_idx = return_idx
+        self.num_stages = num_stages
+        self.stages = DarkNet_cfg[self.depth][0:num_stages]
+
+        norm_options = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            data_format=data_format)
+
+        # Stem: 3 -> 32 channels at stride 1, then a downsample to 64.
+        self.conv0 = ConvBNLayer(
+            ch_in=3,
+            ch_out=32,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            **norm_options)
+        self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, **norm_options)
+
+        self._out_channels = []
+        self.darknet_conv_block_list = []
+        self.downsample_list = []
+        stage_channels = [64, 128, 256, 512, 1024]
+
+        # All stages are registered first, then the downsample layers that
+        # sit between consecutive stages.
+        for stage_id, block_count in enumerate(self.stages):
+            stage_name = 'stage.{}'.format(stage_id)
+            width = int(stage_channels[stage_id])
+            stage_layer = self.add_sublayer(
+                stage_name,
+                Blocks(
+                    width,
+                    width,
+                    block_count,
+                    name=stage_name,
+                    **norm_options))
+            self.darknet_conv_block_list.append(stage_layer)
+            if stage_id in return_idx:
+                self._out_channels.append(width)
+
+        for stage_id in range(num_stages - 1):
+            down_layer = self.add_sublayer(
+                'stage.{}.downsample'.format(stage_id),
+                DownSample(
+                    ch_in=int(stage_channels[stage_id]),
+                    ch_out=int(stage_channels[stage_id + 1]),
+                    **norm_options))
+            self.downsample_list.append(down_layer)
+
+    def forward(self, inputs):
+        """Return the feature maps of the stages listed in ``return_idx``.
+
+        Args:
+            inputs (dict): batch dict; only the ``'image'`` entry is read.
+        """
+        feat = self.downsample0(self.conv0(inputs['image']))
+
+        collected = []
+        last_stage = self.num_stages - 1
+        for stage_id, stage_layer in enumerate(self.darknet_conv_block_list):
+            feat = stage_layer(feat)
+            if stage_id == self.freeze_at:
+                feat.stop_gradient = True
+            if stage_id in self.return_idx:
+                collected.append(feat)
+            # Every stage except the last one is followed by a downsample.
+            if stage_id < last_stage:
+                feat = self.downsample_list[stage_id](feat)
+        return collected
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels only) for each returned feature map."""
+        specs = []
+        for channels in self._out_channels:
+            specs.append(ShapeSpec(channels=channels))
+        return specs
--- a/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
@@ -0,0 +1,720 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+from .swin_transformer import Mlp
+
+__all__ = ['FocalNet']
+
+_PRETRAINED_URL = ('https://bj.bcebos.com/v1/paddledet/models/pretrained/'
+                   '{}_pretrained.pdparams')
+
+
+def _focalnet_cfg(embed_dim,
+                  stage3_depth,
+                  focal_level,
+                  focal_window,
+                  drop_path_rate,
+                  weights,
+                  large=False,
+                  use_postln_in_modulation=False,
+                  normalize_modulator=False):
+    """Build one MODEL_cfg entry.
+
+    Depths are always [2, 2, stage3_depth, 2]; the focal level and window
+    are repeated for the four stages. ``large`` switches on use_conv_embed,
+    use_postln and use_layerscale together. ``weights`` is the file stem
+    inserted into the pretrained-weights URL.
+    """
+    return dict(
+        embed_dim=embed_dim,
+        depths=[2, 2, stage3_depth, 2],
+        focal_levels=[focal_level] * 4,
+        focal_windows=[focal_window] * 4,
+        drop_path_rate=drop_path_rate,
+        use_conv_embed=large,
+        use_postln=large,
+        use_postln_in_modulation=use_postln_in_modulation,
+        use_layerscale=large,
+        normalize_modulator=normalize_modulator,
+        pretrained=_PRETRAINED_URL.format(weights))
+
+
+# Positional columns:
+#   embed_dim, stage3_depth, focal_level, focal_window, drop_path_rate, weights
+MODEL_cfg = {
+    'focalnet_T_224_1k_srf':
+    _focalnet_cfg(96, 6, 2, 3, 0.2, 'focalnet_tiny_srf'),
+    'focalnet_S_224_1k_srf':
+    _focalnet_cfg(96, 18, 2, 3, 0.3, 'focalnet_small_srf'),
+    'focalnet_B_224_1k_srf':
+    _focalnet_cfg(128, 18, 2, 3, 0.5, 'focalnet_base_srf'),
+    'focalnet_T_224_1k_lrf':
+    _focalnet_cfg(96, 6, 3, 3, 0.2, 'focalnet_tiny_lrf'),
+    'focalnet_S_224_1k_lrf':
+    _focalnet_cfg(96, 18, 3, 3, 0.3, 'focalnet_small_lrf'),
+    'focalnet_B_224_1k_lrf':
+    _focalnet_cfg(128, 18, 3, 3, 0.5, 'focalnet_base_lrf'),
+    'focalnet_L_384_22k_fl3':
+    _focalnet_cfg(
+        192, 18, 3, 5, 0.5, 'focalnet_large_lrf_384', large=True),
+    'focalnet_L_384_22k_fl4':
+    _focalnet_cfg(
+        192,
+        18,
+        4,
+        3,
+        0.5,
+        'focalnet_large_lrf_384_fl4',
+        large=True,
+        normalize_modulator=True),
+    'focalnet_XL_384_22k_fl3':
+    _focalnet_cfg(
+        256, 18, 3, 5, 0.5, 'focalnet_xlarge_lrf_384', large=True),
+    'focalnet_XL_384_22k_fl4':
+    _focalnet_cfg(
+        256, 18, 4, 3, 0.5, 'focalnet_xlarge_lrf_384_fl4', large=True),
+    'focalnet_H_224_22k_fl3':
+    _focalnet_cfg(
+        352,
+        18,
+        3,
+        3,
+        0.5,
+        'focalnet_huge_lrf_224',
+        large=True,
+        use_postln_in_modulation=True),
+    'focalnet_H_224_22k_fl4':
+    _focalnet_cfg(
+        352,
+        18,
+        4,
+        3,
+        0.5,
+        'focalnet_huge_lrf_224_fl4',
+        large=True,
+        use_postln_in_modulation=True),
+}
+
+
+class FocalModulation(nn.Layer):
+    """Focal modulation: gated sum of hierarchical depth-wise conv contexts
+    plus a global context, used to modulate a query projection.
+
+    Args:
+        dim (int): Number of input channels.
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+        focal_level (int): Number of focal levels
+        focal_window (int): Focal window size at focal level 1
+        focal_factor (int): Step to increase the focal window. Default: 2
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm
+        normalize_modulator (bool): Whether use normalize in modulator
+    """
+
+    def __init__(self,
+                 dim,
+                 proj_drop=0.,
+                 focal_level=2,
+                 focal_window=7,
+                 focal_factor=2,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False):
+        super().__init__()
+        self.dim = dim
+
+        # specific args for focalv3
+        self.focal_level = focal_level
+        self.focal_window = focal_window
+        self.focal_factor = focal_factor
+        self.use_postln_in_modulation = use_postln_in_modulation
+        self.normalize_modulator = normalize_modulator
+
+        # One linear layer yields query (dim), context (dim) and one gate
+        # per focal level plus one gate for the global context.
+        gate_count = self.focal_level + 1
+        self.f = nn.Linear(dim, 2 * dim + gate_count, bias_attr=True)
+        self.h = nn.Conv2D(
+            dim,
+            dim,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            bias_attr=True)
+
+        self.act = nn.GELU()
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        self.focal_layers = nn.LayerList()
+
+        if self.use_postln_in_modulation:
+            self.ln = nn.LayerNorm(dim)
+
+        # Depth-wise convs whose kernel grows by focal_factor per level.
+        for level in range(self.focal_level):
+            kernel = self.focal_factor * level + self.focal_window
+            depthwise = nn.Conv2D(
+                dim,
+                dim,
+                kernel_size=kernel,
+                stride=1,
+                groups=dim,
+                padding=kernel // 2,
+                bias_attr=False)
+            self.focal_layers.append(nn.Sequential(depthwise, nn.GELU()))
+
+    def forward(self, x):
+        """ Forward function.
+        Args:
+            x: input features with shape of (B, H, W, C)
+        """
+        _, _, _, channels = x.shape
+        gate_count = self.focal_level + 1
+
+        # Project, move channels to axis 1, then split along that axis.
+        projected = self.f(x).transpose([0, 3, 1, 2])
+        q, ctx, gates = paddle.split(projected,
+                                     (channels, channels, gate_count), 1)
+
+        # Each level refines the previous level's context; its gated
+        # output is accumulated.
+        ctx_all = 0
+        for level in range(self.focal_level):
+            ctx = self.focal_layers[level](ctx)
+            level_gate = gates[:, level:level + 1]
+            ctx_all = ctx_all + ctx * level_gate
+
+        # Global context: mean over axes 2 and 3 of the last context.
+        pooled = ctx.mean(2, keepdim=True).mean(3, keepdim=True)
+        ctx_global = self.act(pooled)
+        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
+        if self.normalize_modulator:
+            ctx_all = ctx_all / (self.focal_level + 1)
+
+        # Modulate the query and go back to channels-last.
+        x_out = (q * self.h(ctx_all)).transpose([0, 2, 3, 1])
+        if self.use_postln_in_modulation:
+            x_out = self.ln(x_out)
+        return self.proj_drop(self.proj(x_out))
+
+
+class FocalModulationBlock(nn.Layer):
+    """ Focal Modulation Block: focal modulation and an MLP, each wrapped
+    in a residual connection.
+
+    ``self.H`` and ``self.W`` start as None and are read by ``forward``;
+    they have to be assigned before the block is called.
+
+    Args:
+        dim (int): Number of input channels.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+        focal_level (int): number of focal levels
+        focal_window (int): focal kernel size at level 1
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value for layer scale. Default: 1e-4
+    """
+
+    def __init__(self,
+                 dim,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm,
+                 focal_level=2,
+                 focal_window=9,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4):
+        super().__init__()
+        self.dim = dim
+        self.mlp_ratio = mlp_ratio
+        self.focal_window = focal_window
+        self.focal_level = focal_level
+        self.use_postln = use_postln
+        self.use_layerscale = use_layerscale
+
+        self.norm1 = norm_layer(dim)
+        self.modulation = FocalModulation(
+            dim,
+            proj_drop=drop,
+            focal_level=self.focal_level,
+            focal_window=self.focal_window,
+            use_postln_in_modulation=use_postln_in_modulation,
+            normalize_modulator=normalize_modulator)
+
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+
+        self.norm2 = norm_layer(dim)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+
+        # Spatial size of the token grid; None until assigned by the caller.
+        self.H = None
+        self.W = None
+
+        # Residual branch scales: per-channel parameters with layerscale,
+        # the constant 1.0 otherwise.
+        if self.use_layerscale:
+            self.gamma_1 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+            self.gamma_2 = add_parameter(self,
+                                         layerscale_value * paddle.ones([dim]))
+        else:
+            self.gamma_1 = 1.0
+            self.gamma_2 = 1.0
+
+    def forward(self, x):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        residual = x
+
+        if self.use_postln:
+            # Post-LN: normalize after the modulation and after the MLP.
+            x = self.modulation(x.reshape([-1, H, W, C]))
+            x = self.norm1(x.reshape([-1, H * W, C]))
+            x = residual + self.drop_path(self.gamma_1 * x)
+            return x + self.drop_path(
+                self.gamma_2 * self.norm2(self.mlp(x)))
+
+        # Pre-LN: normalize before the modulation and before the MLP.
+        x = self.norm1(x).reshape([-1, H, W, C])
+        x = self.modulation(x).reshape([-1, H * W, C])
+        x = residual + self.drop_path(self.gamma_1 * x)
+        return x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+
+
+class BasicLayer(nn.Layer):
+    """ A basic focal modulation layer for one stage.
+
+    Holds ``depth`` FocalModulationBlock layers and an optional downsample
+    layer applied after them.
+
+    Args:
+        dim (int): Number of feature channels
+        depth (int): Depths of this stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop (float, optional): Dropout rate. Default: 0.0
+        drop_path (float | np.ndarray | list | tuple, optional): Stochastic
+            depth rate. A sequence is indexed per block (entry i goes to
+            block i); a scalar is shared by all blocks. Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+        focal_level (int): Number of focal levels
+        focal_window (int): Focal window size at focal level 1
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_checkpoint (bool): Whether to use checkpointing to save memory.
+            Stored on the layer; forward() of this class does not read it.
+            Default: False.
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None,
+                 focal_level=2,
+                 focal_window=9,
+                 use_conv_embed=False,
+                 use_layerscale=False,
+                 layerscale_value=1e-4,
+                 use_postln=False,
+                 use_postln_in_modulation=False,
+                 normalize_modulator=False,
+                 use_checkpoint=False):
+        super().__init__()
+        self.depth = depth
+        self.use_checkpoint = use_checkpoint
+
+        # A per-block schedule may be an ndarray, list or tuple. Previously
+        # only ndarray was indexed, so a list/tuple was handed whole to every
+        # block and failed at the `drop_path > 0.` comparison there.
+        per_block_rate = isinstance(drop_path, (np.ndarray, list, tuple))
+
+        # build blocks
+        self.blocks = nn.LayerList([
+            FocalModulationBlock(
+                dim=dim,
+                mlp_ratio=mlp_ratio,
+                drop=drop,
+                drop_path=drop_path[i] if per_block_rate else drop_path,
+                act_layer=nn.GELU,
+                norm_layer=norm_layer,
+                focal_level=focal_level,
+                focal_window=focal_window,
+                use_postln=use_postln,
+                use_postln_in_modulation=use_postln_in_modulation,
+                normalize_modulator=normalize_modulator,
+                use_layerscale=use_layerscale,
+                layerscale_value=layerscale_value) for i in range(depth)
+        ])
+
+        # patch merging layer: halves the resolution and doubles the channels
+        if downsample is not None:
+            self.downsample = downsample(
+                patch_size=2,
+                in_chans=dim,
+                embed_dim=2 * dim,
+                use_conv_embed=use_conv_embed,
+                norm_layer=norm_layer,
+                is_stem=False)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H (int): height of the token grid.
+            W (int): width of the token grid.
+
+        Returns:
+            tuple: ``(x, H, W, x_down, Wh, Ww)`` where ``x`` is the stage
+                output before downsampling and ``x_down`` the downsampled
+                tokens with grid size ``(Wh, Ww)``. Without a downsample
+                layer the last three entries repeat the first three.
+        """
+        for blk in self.blocks:
+            # Blocks read the grid size from their own attributes.
+            blk.H, blk.W = H, W
+            x = blk(x)
+
+        if self.downsample is not None:
+            # (B, H*W, C) -> (B, C, H, W) for the convolutional downsample.
+            x_reshaped = x.transpose([0, 2, 1]).reshape(
+                [x.shape[0], x.shape[-1], H, W])
+            x_down = self.downsample(x_reshaped)
+            x_down = x_down.flatten(2).transpose([0, 2, 1])
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+
+    Projects the input with one strided convolution and optionally
+    normalizes the result over the channel dimension.
+
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
+        is_stem (bool): Is the stem block or not.
+    """
+
+    def __init__(self,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 norm_layer=None,
+                 use_conv_embed=False,
+                 is_stem=False):
+        super().__init__()
+        patch_size = to_2tuple(patch_size)
+        self.patch_size = patch_size
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        if use_conv_embed:
+            # if we choose to use conv embedding, then we treat the stem and non-stem differently
+            if is_stem:
+                kernel_size = 7
+                padding = 2
+                stride = 4
+            else:
+                kernel_size = 3
+                padding = 1
+                stride = 2
+            self.proj = nn.Conv2D(
+                in_chans,
+                embed_dim,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding)
+        else:
+            # Non-overlapping patches: kernel and stride equal the patch size.
+            self.proj = nn.Conv2D(
+                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+        if norm_layer is not None:
+            self.norm = norm_layer(embed_dim)
+        else:
+            self.norm = None
+
+    def forward(self, x):
+        """Pad the input to a multiple of the patch size, then embed it.
+
+        Args:
+            x: input with four dimensions; the last two are treated as
+                height and width.
+
+        Returns:
+            The projected (and, if a norm layer is set, normalized) feature
+            map with ``embed_dim`` channels.
+        """
+        _, _, H, W = x.shape
+        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
+
+        # Amount needed to reach the next multiple of the patch size. The
+        # former `W += W % patch` / `H += H % patch` bookkeeping was dropped:
+        # it did not match the padding applied and was never read afterwards.
+        rem_w = W % patch_w
+        rem_h = H % patch_h
+        pad_right = patch_w - rem_w if rem_w != 0 else 0
+        pad_bottom = patch_h - rem_h if rem_h != 0 else 0
+        if pad_right != 0 or pad_bottom != 0:
+            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
+            x = F.pad(x, [0, pad_right, 0, pad_bottom])
+
+        x = self.proj(x)
+        if self.norm is not None:
+            # Normalize over channels: flatten the grid to tokens, apply the
+            # norm, then restore the (embed_dim, Wh, Ww) layout.
+            _, _, Wh, Ww = x.shape
+            x = x.flatten(2).transpose([0, 2, 1])
+            x = self.norm(x)
+            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
+
+        return x
+
+
+@register
+@serializable
+class FocalNet(nn.Layer):
+    """FocalNet backbone.
+
+    The preset stored in ``MODEL_cfg[arch]`` always overrides the matching
+    constructor arguments: ``embed_dim``, ``depths``, ``drop_path_rate``,
+    ``focal_levels``, ``focal_windows``, ``use_conv_embed``,
+    ``use_layerscale``, ``use_postln``, ``use_postln_in_modulation`` and
+    ``normalize_modulator``.
+
+    Args:
+        arch (str): Architecture of FocalNet, a key of ``MODEL_cfg``.
+        out_indices (Sequence[int]): Output from which stages.
+        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+            -1 means not freezing any parameters.
+        patch_size (int | tuple(int)): Patch size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        depths (tuple[int]): Depths of each FocalNet Transformer stage.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
+        drop_rate (float): Dropout rate.
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
+        focal_levels (Sequence[int]): Number of focal levels at four stages
+        focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
+        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
+        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
+        layerscale_value (float): Value of layerscale
+        use_postln (bool): Whether use layernorm after modulation. Default: False.
+        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
+        normalize_modulator (bool): Whether use normalize in modulator
+        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
+        pretrained (str): URL (any string containing 'http') or local path of
+            weights to load. When None, ``MODEL_cfg[arch]['pretrained']`` is used.
+    """
+
+    def __init__(
+            self,
+            arch='focalnet_T_224_1k_srf',
+            out_indices=(0, 1, 2, 3),
+            frozen_stages=-1,
+            patch_size=4,
+            in_chans=3,
+            embed_dim=96,
+            depths=[2, 2, 6, 2],
+            mlp_ratio=4.,
+            drop_rate=0.,
+            drop_path_rate=0.2,  # 0.5 better for large+ models
+            norm_layer=nn.LayerNorm,
+            patch_norm=True,
+            focal_levels=[2, 2, 2, 2],
+            focal_windows=[3, 3, 3, 3],
+            use_conv_embed=False,
+            use_layerscale=False,
+            layerscale_value=1e-4,
+            use_postln=False,
+            use_postln_in_modulation=False,
+            normalize_modulator=False,
+            use_checkpoint=False,
+            pretrained=None):
+        super(FocalNet, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        # the architecture preset wins over the keyword arguments
+        arch_cfg = MODEL_cfg[arch]
+        embed_dim = arch_cfg['embed_dim']
+        depths = arch_cfg['depths']
+        drop_path_rate = arch_cfg['drop_path_rate']
+        focal_levels = arch_cfg['focal_levels']
+        focal_windows = arch_cfg['focal_windows']
+        use_conv_embed = arch_cfg['use_conv_embed']
+        use_layerscale = arch_cfg['use_layerscale']
+        use_postln = arch_cfg['use_postln']
+        use_postln_in_modulation = arch_cfg['use_postln_in_modulation']
+        normalize_modulator = arch_cfg['normalize_modulator']
+        if pretrained is None:
+            pretrained = arch_cfg['pretrained']
+
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+        self.num_layers = len(depths)
+        self.patch_norm = patch_norm
+
+        # stem: split the image into patches
+        stem_norm = norm_layer if self.patch_norm else None
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=stem_norm,
+            use_conv_embed=use_conv_embed,
+            is_stem=True)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth: one rate per block, linearly increasing
+        dpr = np.linspace(0, drop_path_rate, sum(depths))
+
+        # build the stages; every stage but the last gets a downsample layer
+        self.layers = nn.LayerList()
+        block_start = 0
+        for i_layer, depth in enumerate(depths):
+            block_end = block_start + depth
+            is_last_stage = i_layer == self.num_layers - 1
+            stage = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depth,
+                mlp_ratio=mlp_ratio,
+                drop=drop_rate,
+                drop_path=dpr[block_start:block_end],
+                norm_layer=norm_layer,
+                downsample=None if is_last_stage else PatchEmbed,
+                focal_level=focal_levels[i_layer],
+                focal_window=focal_windows[i_layer],
+                use_conv_embed=use_conv_embed,
+                use_layerscale=use_layerscale,
+                layerscale_value=layerscale_value,
+                use_postln=use_postln,
+                use_postln_in_modulation=use_postln_in_modulation,
+                normalize_modulator=normalize_modulator,
+                use_checkpoint=use_checkpoint)
+            self.layers.append(stage)
+            block_start = block_end
+
+        # channel count doubles at every stage
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # one output norm per requested stage, registered as norm<i>
+        for i_layer in out_indices:
+            self.add_sublayer(f'norm{i_layer}',
+                              norm_layer(num_features[i_layer]))
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            # anything containing 'http' is downloaded, otherwise it is
+            # used directly as a local path
+            path = (paddle.utils.download.get_weights_path_from_url(pretrained)
+                    if 'http' in pretrained else pretrained)
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):
+        """Put frozen parts in eval mode and stop their gradients.
+
+        ``frozen_stages >= 0`` freezes the patch embedding;
+        ``frozen_stages >= 2`` additionally freezes the first
+        ``frozen_stages - 1`` stages.
+        """
+        if self.frozen_stages < 0:
+            return
+
+        self.patch_embed.eval()
+        for param in self.patch_embed.parameters():
+            param.stop_gradient = True
+
+        if self.frozen_stages < 2:
+            return
+
+        self.pos_drop.eval()
+        for stage_idx in range(self.frozen_stages - 1):
+            stage = self.layers[stage_idx]
+            stage.eval()
+            for param in stage.parameters():
+                param.stop_gradient = True
+
+    def _init_weights(self, m):
+        """Initializer applied to every sublayer through ``self.apply``."""
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        """Run the backbone on ``x['image']``.
+
+        Returns:
+            list[Tensor]: One normalized feature map per index in
+            ``out_indices``, each reshaped to ``[-1, H, W, C]`` and then
+            transposed with ``(0, 3, 1, 2)``.
+        """
+        feat = self.patch_embed(x['image'])
+        _, _, Wh, Ww = feat.shape
+        # [B, C, Wh, Ww] -> [B, Wh*Ww, C]
+        feat = feat.flatten(2).transpose([0, 2, 1])
+        feat = self.pos_drop(feat)
+
+        outs = []
+        for i, stage in enumerate(self.layers):
+            stage_out, H, W, feat, Wh, Ww = stage(feat, Wh, Ww)
+            if i not in self.out_indices:
+                continue
+            normed = getattr(self, f'norm{i}')(stage_out)
+            normed = normed.reshape([-1, H, W, self.num_features[i]])
+            outs.append(normed.transpose((0, 3, 1, 2)))
+
+        return outs
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels and stride) of each returned feature map."""
+        out_strides = [4, 8, 16, 32]
+        return [
+            ShapeSpec(
+                channels=self.num_features[idx], stride=out_strides[idx])
+            for idx in self.out_indices
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
@@ -0,0 +1,447 @@
+# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn.initializer import KaimingNormal, Constant
+from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
+from paddle.regularizer import L2Decay
+from paddle import ParamAttr
+
+import copy
+
+from ppdet.core.workspace import register, serializable
+from ..shape_spec import ShapeSpec
+
+# Public API of this module.
+__all__ = ['PPHGNetV2']
+
+# Module-level initializer instances; PPHGNetV2._init_weights calls them
+# directly on layer weights and biases.
+kaiming_normal_ = KaimingNormal()
+zeros_ = Constant(value=0.)
+ones_ = Constant(value=1.)
+
+
+class LearnableAffineBlock(nn.Layer):
+    """Learnable affine transform ``scale * x + bias`` with one-element parameters.
+
+    Args:
+        scale_value (float): Initial value of ``scale``.
+        bias_value (float): Initial value of ``bias``.
+        lr_mult (float): Learning-rate multiplier of the enclosing stage.
+        lab_lr (float): Extra learning-rate factor for this block; both
+            parameters use ``lr_mult * lab_lr`` as their learning rate.
+    """
+
+    def __init__(self,
+                 scale_value=1.0,
+                 bias_value=0.0,
+                 lr_mult=1.0,
+                 lab_lr=0.01):
+        super().__init__()
+        learning_rate = lr_mult * lab_lr
+
+        scale_param = self.create_parameter(
+            shape=[1, ],
+            default_initializer=Constant(value=scale_value),
+            attr=ParamAttr(learning_rate=learning_rate))
+        self.scale = scale_param
+        self.add_parameter("scale", self.scale)
+
+        bias_param = self.create_parameter(
+            shape=[1, ],
+            default_initializer=Constant(value=bias_value),
+            attr=ParamAttr(learning_rate=learning_rate))
+        self.bias = bias_param
+        self.add_parameter("bias", self.bias)
+
+    def forward(self, x):
+        scaled = self.scale * x
+        return scaled + self.bias
+
+
+class ConvBNAct(nn.Layer):
+    """Bias-free Conv2D -> BatchNorm2D -> optional ReLU -> optional LearnableAffineBlock.
+
+    Args:
+        in_channels (int): Input channels of the convolution.
+        out_channels (int): Output channels of the convolution.
+        kernel_size (int): Convolution kernel size.
+        stride (int): Convolution stride.
+        padding (int | str): A string is forwarded to Conv2D unchanged; any
+            other value is ignored and ``(kernel_size - 1) // 2`` is used.
+        groups (int): Convolution groups.
+        use_act (bool): Append a ReLU after the batch norm.
+        use_lab (bool): Append a LearnableAffineBlock after the ReLU. Only
+            takes effect when ``use_act`` is True.
+        lr_mult (float): Learning-rate multiplier for the conv and BN params.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size=3,
+                 stride=1,
+                 padding=1,
+                 groups=1,
+                 use_act=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.use_act = use_act
+        self.use_lab = use_lab
+
+        # only string paddings (e.g. "SAME") are honoured as given
+        if isinstance(padding, str):
+            conv_padding = padding
+        else:
+            conv_padding = (kernel_size - 1) // 2
+
+        self.conv = Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=conv_padding,
+            groups=groups,
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=False)
+        # BN scale and shift are excluded from weight decay
+        bn_weight_attr = ParamAttr(
+            regularizer=L2Decay(0.0), learning_rate=lr_mult)
+        bn_bias_attr = ParamAttr(
+            regularizer=L2Decay(0.0), learning_rate=lr_mult)
+        self.bn = BatchNorm2D(
+            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)
+
+        if self.use_act:
+            self.act = ReLU()
+            if self.use_lab:
+                self.lab = LearnableAffineBlock(lr_mult=lr_mult)
+
+    def forward(self, x):
+        out = self.bn(self.conv(x))
+        if not self.use_act:
+            return out
+        out = self.act(out)
+        if self.use_lab:
+            out = self.lab(out)
+        return out
+
+
+class LightConvBNAct(nn.Layer):
+    """1x1 ConvBNAct without activation followed by a depthwise ConvBNAct.
+
+    Args:
+        in_channels (int): Input channels of the 1x1 convolution.
+        out_channels (int): Output channels of both convolutions; the second
+            one uses ``groups=out_channels``.
+        kernel_size (int): Kernel size of the depthwise convolution.
+        stride (int): Accepted for signature compatibility; not used here.
+        groups (int): Accepted for signature compatibility; not used here.
+        use_lab (bool): Forwarded to both ConvBNAct layers.
+        lr_mult (float): Learning-rate multiplier forwarded to both layers.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 groups=1,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
+        # pointwise projection, no activation
+        self.conv1 = ConvBNAct(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            use_act=False,
+            **shared)
+        # depthwise convolution with activation
+        self.conv2 = ConvBNAct(
+            in_channels=out_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            groups=out_channels,
+            use_act=True,
+            **shared)
+
+    def forward(self, x):
+        return self.conv2(self.conv1(x))
+
+
+class StemBlock(nn.Layer):
+    """Stem of PPHGNetV2.
+
+    ``stem1`` (3x3, stride 2) feeds two branches: two 2x2 "SAME" convolutions
+    (``stem2a`` -> ``stem2b``) and a 2x2 stride-1 max pool. The branches are
+    concatenated along axis 1, then reduced by ``stem3`` (3x3, stride 2) and
+    ``stem4`` (1x1).
+
+    Args:
+        in_channels (int): Channels of the input image.
+        mid_channels (int): Channels produced by ``stem1``, ``stem2b`` and
+            ``stem3``.
+        out_channels (int): Channels produced by ``stem4``.
+        use_lab (bool): Forwarded to every ConvBNAct.
+        lr_mult (float): Learning-rate multiplier forwarded to every ConvBNAct.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+
+        def conv(cin, cout, kernel_size, stride, **extra):
+            # every stem conv shares the same use_lab / lr_mult settings
+            return ConvBNAct(
+                in_channels=cin,
+                out_channels=cout,
+                kernel_size=kernel_size,
+                stride=stride,
+                use_lab=use_lab,
+                lr_mult=lr_mult,
+                **extra)
+
+        half_channels = mid_channels // 2
+        self.stem1 = conv(in_channels, mid_channels, 3, 2)
+        self.stem2a = conv(mid_channels, half_channels, 2, 1, padding="SAME")
+        self.stem2b = conv(half_channels, mid_channels, 2, 1, padding="SAME")
+        # input is the concatenation of the pool branch and the conv branch
+        self.stem3 = conv(mid_channels * 2, mid_channels, 3, 2)
+        self.stem4 = conv(mid_channels, out_channels, 1, 1)
+        self.pool = nn.MaxPool2D(
+            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")
+
+    def forward(self, x):
+        stem = self.stem1(x)
+        conv_branch = self.stem2b(self.stem2a(stem))
+        pool_branch = self.pool(stem)
+        merged = paddle.concat([pool_branch, conv_branch], 1)
+        return self.stem4(self.stem3(merged))
+
+
+class HG_Block(nn.Layer):
+    """HG block: a chain of conv layers whose outputs are all aggregated.
+
+    The input and the output of each of the ``layer_num`` chained layers are
+    concatenated along axis 1, then passed through two 1x1 ConvBNAct layers
+    (squeeze to ``out_channels // 2``, then excite to ``out_channels``).
+
+    Args:
+        in_channels (int): Channels of the block input.
+        mid_channels (int): Output channels of every chained layer.
+        out_channels (int): Channels of the block output.
+        kernel_size (int): Kernel size of the chained layers.
+        layer_num (int): Number of chained layers.
+        identity (bool): Add the block input to the output (residual).
+        light_block (bool): Use LightConvBNAct for the chained layers,
+            otherwise ConvBNAct.
+        use_lab (bool): Forwarded to every conv layer.
+        lr_mult (float): Learning-rate multiplier forwarded to every conv layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 kernel_size=3,
+                 layer_num=6,
+                 identity=False,
+                 light_block=True,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        self.identity = identity
+
+        self.layers = nn.LayerList()
+        # pick the layer class directly instead of eval()-ing its name
+        block_cls = LightConvBNAct if light_block else ConvBNAct
+        for i in range(layer_num):
+            self.layers.append(
+                block_cls(
+                    in_channels=in_channels if i == 0 else mid_channels,
+                    out_channels=mid_channels,
+                    stride=1,
+                    kernel_size=kernel_size,
+                    use_lab=use_lab,
+                    lr_mult=lr_mult))
+        # feature aggregation: block input plus every layer output
+        total_channels = in_channels + layer_num * mid_channels
+        self.aggregation_squeeze_conv = ConvBNAct(
+            in_channels=total_channels,
+            out_channels=out_channels // 2,
+            kernel_size=1,
+            stride=1,
+            use_lab=use_lab,
+            lr_mult=lr_mult)
+        self.aggregation_excitation_conv = ConvBNAct(
+            in_channels=out_channels // 2,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            use_lab=use_lab,
+            lr_mult=lr_mult)
+
+    def forward(self, x):
+        identity = x
+        output = [x]
+        for layer in self.layers:
+            x = layer(x)
+            output.append(x)
+        x = paddle.concat(output, axis=1)
+        x = self.aggregation_squeeze_conv(x)
+        x = self.aggregation_excitation_conv(x)
+        if self.identity:
+            x += identity
+        return x
+
+
+class HG_Stage(nn.Layer):
+    """One PPHGNetV2 stage: optional downsample followed by ``block_num`` HG_Blocks.
+
+    Args:
+        in_channels (int): Channels of the stage input.
+        mid_channels (int): ``mid_channels`` of every HG_Block.
+        out_channels (int): Channels of the stage output.
+        block_num (int): Number of HG_Blocks. Every block after the first
+            takes ``out_channels`` as input and uses the identity shortcut.
+        layer_num (int): ``layer_num`` of every HG_Block.
+        downsample (bool): Prepend a depthwise 3x3 stride-2 ConvBNAct without
+            activation.
+        light_block (bool): Forwarded to every HG_Block.
+        kernel_size (int): Forwarded to every HG_Block.
+        use_lab (bool): Forwarded to the downsample layer and every HG_Block.
+        lr_mult (float): Learning-rate multiplier forwarded likewise.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 mid_channels,
+                 out_channels,
+                 block_num,
+                 layer_num=6,
+                 downsample=True,
+                 light_block=True,
+                 kernel_size=3,
+                 use_lab=False,
+                 lr_mult=1.0):
+        super().__init__()
+        # holds the flag itself when falsy, the downsample layer otherwise
+        self.downsample = downsample
+        if downsample:
+            self.downsample = ConvBNAct(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                kernel_size=3,
+                stride=2,
+                groups=in_channels,
+                use_act=False,
+                use_lab=use_lab,
+                lr_mult=lr_mult)
+
+        self.blocks = nn.Sequential(* [
+            HG_Block(
+                in_channels=out_channels if block_idx else in_channels,
+                mid_channels=mid_channels,
+                out_channels=out_channels,
+                kernel_size=kernel_size,
+                layer_num=layer_num,
+                identity=block_idx != 0,
+                light_block=light_block,
+                use_lab=use_lab,
+                lr_mult=lr_mult) for block_idx in range(block_num)
+        ])
+
+    def forward(self, x):
+        if self.downsample:
+            x = self.downsample(x)
+        return self.blocks(x)
+
+
+def _freeze_norm(m: nn.BatchNorm2D):
+    """Build a frozen BatchNorm2D with the same number of features as ``m``.
+
+    Only ``m._num_features`` is read: the returned layer is newly created,
+    uses global statistics, and has non-trainable parameters with gradients
+    stopped, zero learning rate and zero weight decay.
+    """
+
+    def frozen_attr():
+        return ParamAttr(
+            learning_rate=0., regularizer=L2Decay(0.), trainable=False)
+
+    frozen = nn.BatchNorm2D(
+        m._num_features,
+        weight_attr=frozen_attr(),
+        bias_attr=frozen_attr(),
+        use_global_stats=True)
+    for p in frozen.parameters():
+        p.stop_gradient = True
+    return frozen
+
+
+def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
+    """Recursively replace every BatchNorm2D under ``model`` via ``reset_func``.
+
+    Returns ``reset_func(model)`` when ``model`` itself is a BatchNorm2D.
+    Otherwise the children are rewritten in place and ``model`` is returned.
+    """
+    if isinstance(model, nn.BatchNorm2D):
+        return reset_func(model)
+
+    for child_name, child in model.named_children():
+        replacement = reset_bn(child, reset_func)
+        # only rebind children that were actually swapped out
+        if replacement is not child:
+            setattr(model, child_name, replacement)
+    return model
+
+
+@register
+@serializable
+class PPHGNetV2(nn.Layer):
+    """
+    PPHGNetV2 backbone.
+
+    Args:
+        arch (str): Key of ``arch_configs`` ('L' or 'X') selecting the stem
+            channels and the per-stage configuration.
+        use_lab (bool): Whether to use LearnableAffineBlock in network.
+        lr_mult_list (list): Learning-rate multipliers; entry 0 is used for
+            the stem and entry ``i + 1`` for stage ``i``.
+        return_idx (list): Indices of the stages whose outputs are returned.
+        freeze_stem_only (bool): When False, stages ``0 .. freeze_at`` are
+            frozen in addition to the stem.
+        freeze_at (int): Freezing is enabled when this is ``>= 0``.
+        freeze_norm (bool): Replace every BatchNorm2D with a frozen one.
+    """
+
+    arch_configs = {
+        'L': {
+            'stem_channels': [3, 32, 48],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [48, 48, 128, 1, False, False, 3, 6],
+                "stage2": [128, 96, 512, 1, True, False, 3, 6],
+                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
+                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
+            }
+        },
+        'X': {
+            'stem_channels': [3, 32, 64],
+            'stage_config': {
+                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
+                "stage1": [64, 64, 128, 1, False, False, 3, 6],
+                "stage2": [128, 128, 512, 2, True, False, 3, 6],
+                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
+                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
+            }
+        }
+    }
+
+    def __init__(self,
+                 arch,
+                 use_lab=False,
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+                 return_idx=[1, 2, 3],
+                 freeze_stem_only=True,
+                 freeze_at=0,
+                 freeze_norm=True):
+        super().__init__()
+        self.use_lab = use_lab
+        self.return_idx = return_idx
+
+        arch_cfg = self.arch_configs[arch]
+        stem_in, stem_mid, stem_out = arch_cfg['stem_channels'][:3]
+        stage_specs = list(arch_cfg['stage_config'].values())
+
+        self._out_strides = [4, 8, 16, 32]
+        # out_channels is the third entry of every stage spec
+        self._out_channels = [spec[2] for spec in stage_specs]
+
+        # stem
+        self.stem = StemBlock(
+            in_channels=stem_in,
+            mid_channels=stem_mid,
+            out_channels=stem_out,
+            use_lab=use_lab,
+            lr_mult=lr_mult_list[0])
+
+        # stages
+        self.stages = nn.LayerList()
+        for stage_idx, spec in enumerate(stage_specs):
+            (in_channels, mid_channels, out_channels, block_num, downsample,
+             light_block, kernel_size, layer_num) = spec
+            self.stages.append(
+                HG_Stage(
+                    in_channels=in_channels,
+                    mid_channels=mid_channels,
+                    out_channels=out_channels,
+                    block_num=block_num,
+                    layer_num=layer_num,
+                    downsample=downsample,
+                    light_block=light_block,
+                    kernel_size=kernel_size,
+                    use_lab=use_lab,
+                    lr_mult=lr_mult_list[stage_idx + 1]))
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.stem)
+            if not freeze_stem_only:
+                num_frozen = min(freeze_at + 1, len(self.stages))
+                for stage in self.stages[:num_frozen]:
+                    self._freeze_parameters(stage)
+
+        if freeze_norm:
+            reset_bn(self, reset_func=_freeze_norm)
+
+        self._init_weights()
+
+    def _freeze_parameters(self, m):
+        """Stop gradients for every parameter of ``m``."""
+        for param in m.parameters():
+            param.stop_gradient = True
+
+    def _init_weights(self):
+        """Initialize conv weights, batch-norm scale/shift and linear biases."""
+        for layer in self.sublayers():
+            if isinstance(layer, nn.Conv2D):
+                kaiming_normal_(layer.weight)
+            elif isinstance(layer, nn.BatchNorm2D):
+                ones_(layer.weight)
+                zeros_(layer.bias)
+            elif isinstance(layer, nn.Linear):
+                zeros_(layer.bias)
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels and stride) of each returned feature map."""
+        return [
+            ShapeSpec(
+                channels=self._out_channels[idx],
+                stride=self._out_strides[idx]) for idx in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        """Run the stem and all stages on ``inputs['image']``.
+
+        Returns:
+            list[Tensor]: Outputs of the stages listed in ``return_idx``, in
+            stage order.
+        """
+        feat = self.stem(inputs['image'])
+        outs = []
+        for idx, stage in enumerate(self.stages):
+            feat = stage(feat)
+            if idx in self.return_idx:
+                outs.append(feat)
+        return outs
+        return outs
--- a/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
@@ -0,0 +1,271 @@
+# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.nn import AdaptiveAvgPool2D, Conv2D
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+# Public API of this module.
+__all__ = ['LCNet']
+
+# Block specifications for each LCNet stage. Every row is
+# [kernel size, input channels, output channels, stride, use SE module].
+# The channel counts are base values: LCNet multiplies them by its `scale`
+# argument and rounds the result with make_divisible.
+NET_CONFIG = {
+    "blocks2":
+    # k, in_c, out_c, s, use_se
+    [[3, 16, 32, 1, False], ],
+    "blocks3": [
+        [3, 32, 64, 2, False],
+        [3, 64, 64, 1, False],
+    ],
+    "blocks4": [
+        [3, 64, 128, 2, False],
+        [3, 128, 128, 1, False],
+    ],
+    "blocks5": [
+        [3, 128, 256, 2, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+        [5, 256, 256, 1, False],
+    ],
+    # only the last stage uses SE modules
+    "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
+}
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    """Round ``v`` to a nearby multiple of ``divisor``.
+
+    The result is never below ``min_value`` (``divisor`` when None), and it
+    is bumped up by one ``divisor`` if rounding dropped it under 90% of ``v``.
+    """
+    lower_bound = divisor if min_value is None else min_value
+    rounded = int(v + divisor / 2) // divisor * divisor
+    result = max(lower_bound, rounded)
+    # make sure rounding down does not shrink the value by more than 10%
+    if result < 0.9 * v:
+        result += divisor
+    return result
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D -> BatchNorm2D -> activation.
+
+    Args:
+        num_channels (int): Input channels of the convolution.
+        filter_size (int): Kernel size; padding is ``(filter_size - 1) // 2``.
+        num_filters (int): Output channels of the convolution.
+        stride (int): Convolution stride.
+        num_groups (int): Convolution groups.
+        act (str): Activation name, either 'hard_swish' or 'relu6'.
+
+    Raises:
+        ValueError: If ``act`` is not one of the supported names.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 filter_size,
+                 num_filters,
+                 stride,
+                 num_groups=1,
+                 act='hard_swish'):
+        super().__init__()
+
+        # Fail at construction: otherwise self.act would never be set and
+        # forward() would die later with an unrelated AttributeError.
+        if act not in ('hard_swish', 'relu6'):
+            raise ValueError(
+                "Unsupported act: {}, expected 'hard_swish' or 'relu6'".format(
+                    act))
+
+        self.conv = Conv2D(
+            in_channels=num_channels,
+            out_channels=num_filters,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=num_groups,
+            weight_attr=ParamAttr(initializer=KaimingNormal()),
+            bias_attr=False)
+
+        # BN scale and shift are excluded from weight decay
+        self.bn = nn.BatchNorm2D(
+            num_filters,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        if act == 'hard_swish':
+            self.act = nn.Hardswish()
+        else:
+            self.act = nn.ReLU6()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.act(x)
+        return x
+
+
+class DepthwiseSeparable(nn.Layer):
+    """Depthwise ConvBNLayer -> optional SEModule -> 1x1 pointwise ConvBNLayer.
+
+    Args:
+        num_channels (int): Input channels; also the depthwise group count.
+        num_filters (int): Output channels of the pointwise convolution.
+        stride (int): Stride of the depthwise convolution.
+        dw_size (int): Kernel size of the depthwise convolution.
+        use_se (bool): Insert an SEModule between the two convolutions.
+        act (str): Activation name forwarded to both ConvBNLayer instances.
+    """
+
+    def __init__(self,
+                 num_channels,
+                 num_filters,
+                 stride,
+                 dw_size=3,
+                 use_se=False,
+                 act='hard_swish'):
+        super().__init__()
+        self.use_se = use_se
+        # depthwise: one group per input channel, channel count unchanged
+        self.dw_conv = ConvBNLayer(
+            num_channels,
+            dw_size,
+            num_channels,
+            stride,
+            num_groups=num_channels,
+            act=act)
+        if use_se:
+            self.se = SEModule(num_channels)
+        # pointwise: 1x1 conv that changes the channel count
+        self.pw_conv = ConvBNLayer(num_channels, 1, num_filters, 1, act=act)
+
+    def forward(self, x):
+        feat = self.dw_conv(x)
+        if self.use_se:
+            feat = self.se(feat)
+        return self.pw_conv(feat)
+
+
+class SEModule(nn.Layer):
+    """Squeeze-and-excitation gate.
+
+    The input is pooled to 1x1, passed through a 1x1 conv reducing the
+    channels by ``reduction``, a ReLU, a 1x1 conv restoring the channels and
+    a hard sigmoid; the result multiplies the input.
+
+    Args:
+        channel (int): Number of input (and output) channels.
+        reduction (int): Channel reduction factor of the bottleneck.
+    """
+
+    def __init__(self, channel, reduction=4):
+        super().__init__()
+        squeezed_channels = channel // reduction
+        self.avg_pool = AdaptiveAvgPool2D(1)
+        self.conv1 = Conv2D(
+            in_channels=channel,
+            out_channels=squeezed_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.relu = nn.ReLU()
+        self.conv2 = Conv2D(
+            in_channels=squeezed_channels,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0)
+        self.hardsigmoid = nn.Hardsigmoid()
+
+    def forward(self, x):
+        gate = self.avg_pool(x)
+        gate = self.relu(self.conv1(gate))
+        gate = self.hardsigmoid(self.conv2(gate))
+        return paddle.multiply(x=x, y=gate)
+
+
+@register
+@serializable
+class LCNet(nn.Layer):
+    """LCNet backbone built from the stage specifications in ``NET_CONFIG``.
+
+    Args:
+        scale (float): Multiplier applied to every base channel count before
+            rounding with ``make_divisible``.
+        feature_maps (list[int]): Which stage outputs to return; 2, 3, 4 and 5
+            select the outputs of blocks3, blocks4, blocks5 and blocks6.
+        act (str): Activation name forwarded to every conv layer.
+    """
+
+    # stages whose outputs can be returned, in feature-map order (2, 3, 4, 5)
+    _OUT_STAGES = ("blocks3", "blocks4", "blocks5", "blocks6")
+
+    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
+        super().__init__()
+        self.scale = scale
+        self.feature_maps = feature_maps
+
+        self.conv1 = ConvBNLayer(
+            num_channels=3,
+            filter_size=3,
+            num_filters=make_divisible(16 * scale),
+            stride=2,
+            act=act)
+
+        self.blocks2 = self._make_stage("blocks2", scale, act)
+        self.blocks3 = self._make_stage("blocks3", scale, act)
+        self.blocks4 = self._make_stage("blocks4", scale, act)
+        self.blocks5 = self._make_stage("blocks5", scale, act)
+        self.blocks6 = self._make_stage("blocks6", scale, act)
+
+        # output channels of a stage = scaled out_c of its last block
+        out_channels = [
+            make_divisible(NET_CONFIG[name][-1][2] * scale)
+            for name in self._OUT_STAGES
+        ]
+        self._out_channels = [
+            ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps
+        ]
+
+    @staticmethod
+    def _make_stage(name, scale, act):
+        """Build the DepthwiseSeparable sequence for ``NET_CONFIG[name]``."""
+        return nn.Sequential(* [
+            DepthwiseSeparable(
+                num_channels=make_divisible(in_c * scale),
+                num_filters=make_divisible(out_c * scale),
+                dw_size=k,
+                stride=s,
+                use_se=se,
+                act=act) for k, in_c, out_c, s, se in NET_CONFIG[name]
+        ])
+
+    def forward(self, inputs):
+        """Run the backbone on ``inputs['image']``.
+
+        Returns:
+            list[Tensor]: Outputs of the stages selected by ``feature_maps``,
+            in stage order.
+        """
+        x = self.conv1(inputs['image'])
+        x = self.blocks2(x)
+
+        outs = []
+        stages = (self.blocks3, self.blocks4, self.blocks5, self.blocks6)
+        for idx, stage in enumerate(stages):
+            x = stage(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        """ShapeSpec (channels only) of each returned feature map."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
@@ -0,0 +1,402 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import KaimingNormal
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNet']
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act='relu',
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+        self._conv = nn.Conv2D(
+            in_channels,
+            out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=conv_lr,
+                initializer=KaimingNormal(),
+                regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
+        if norm_type in ['sync_bn', 'bn']:
+            self._batch_norm = nn.BatchNorm2D(
+                out_channels, weight_attr=param_attr, bias_attr=bias_attr)
+
+    def forward(self, x):
+        x = self._conv(x)
+        x = self._batch_norm(x)
+        if self.act == "relu":
+            x = F.relu(x)
+        elif self.act == "relu6":
+            x = F.relu6(x)
+        return x
+
+
+class DepthwiseSeparable(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups,
+                 stride,
+                 scale,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(DepthwiseSeparable, self).__init__()
+
+        self._depthwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1 * scale),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups * scale),
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_dw")
+
+        self._pointwise_conv = ConvBNLayer(
+            int(out_channels1 * scale),
+            int(out_channels2 * scale),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_sep")
+
+    def forward(self, x):
+        x = self._depthwise_conv(x)
+        x = self._pointwise_conv(x)
+        return x
+
+
+class ExtraBlock(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels1,
+                 out_channels2,
+                 num_groups=1,
+                 stride=2,
+                 conv_lr=1.,
+                 conv_decay=0.,
+                 norm_decay=0.,
+                 norm_type='bn',
+                 name=None):
+        super(ExtraBlock, self).__init__()
+
+        self.pointwise_conv = ConvBNLayer(
+            in_channels,
+            int(out_channels1),
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_extra1")
+
+        self.normal_conv = ConvBNLayer(
+            int(out_channels1),
+            int(out_channels2),
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            num_groups=int(num_groups),
+            act='relu6',
+            conv_lr=conv_lr,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name=name + "_extra2")
+
+    def forward(self, x):
+        x = self.pointwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 conv_decay=0.,
+                 scale=1,
+                 conv_learning_rate=1.0,
+                 feature_maps=[4, 6, 13],
+                 with_extra_blocks=False,
+                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
+                                      [64, 128]]):
+        super(MobileNet, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        self._out_channels = []
+
+        self.conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=int(32 * scale),
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            conv_lr=conv_learning_rate,
+            conv_decay=conv_decay,
+            norm_decay=norm_decay,
+            norm_type=norm_type,
+            name="conv1")
+
+        self.dwsl = []
+        dws21 = self.add_sublayer(
+            "conv2_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(32 * scale),
+                out_channels1=32,
+                out_channels2=64,
+                num_groups=32,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_1"))
+        self.dwsl.append(dws21)
+        self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)
+        dws22 = self.add_sublayer(
+            "conv2_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(64 * scale),
+                out_channels1=64,
+                out_channels2=128,
+                num_groups=64,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv2_2"))
+        self.dwsl.append(dws22)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        # 1/4
+        dws31 = self.add_sublayer(
+            "conv3_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=128,
+                num_groups=128,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_1"))
+        self.dwsl.append(dws31)
+        self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
+        dws32 = self.add_sublayer(
+            "conv3_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(128 * scale),
+                out_channels1=128,
+                out_channels2=256,
+                num_groups=128,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv3_2"))
+        self.dwsl.append(dws32)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        # 1/8
+        dws41 = self.add_sublayer(
+            "conv4_1",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=256,
+                num_groups=256,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_1"))
+        self.dwsl.append(dws41)
+        self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
+        dws42 = self.add_sublayer(
+            "conv4_2",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(256 * scale),
+                out_channels1=256,
+                out_channels2=512,
+                num_groups=256,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv4_2"))
+        self.dwsl.append(dws42)
+        self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        # 1/16
+        for i in range(5):
+            tmp = self.add_sublayer(
+                "conv5_" + str(i + 1),
+                sublayer=DepthwiseSeparable(
+                    in_channels=int(512 * scale),
+                    out_channels1=512,
+                    out_channels2=512,
+                    num_groups=512,
+                    stride=1,
+                    scale=scale,
+                    conv_lr=conv_learning_rate,
+                    conv_decay=conv_decay,
+                    norm_decay=norm_decay,
+                    norm_type=norm_type,
+                    name="conv5_" + str(i + 1)))
+            self.dwsl.append(tmp)
+            self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
+        dws56 = self.add_sublayer(
+            "conv5_6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(512 * scale),
+                out_channels1=512,
+                out_channels2=1024,
+                num_groups=512,
+                stride=2,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv5_6"))
+        self.dwsl.append(dws56)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+        # 1/32
+        dws6 = self.add_sublayer(
+            "conv6",
+            sublayer=DepthwiseSeparable(
+                in_channels=int(1024 * scale),
+                out_channels1=1024,
+                out_channels2=1024,
+                num_groups=1024,
+                stride=1,
+                scale=scale,
+                conv_lr=conv_learning_rate,
+                conv_decay=conv_decay,
+                norm_decay=norm_decay,
+                norm_type=norm_type,
+                name="conv6"))
+        self.dwsl.append(dws6)
+        self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_blocks = []
+            for i, block_filter in enumerate(self.extra_block_filters):
+                in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
+                conv_extra = self.add_sublayer(
+                    "conv7_" + str(i + 1),
+                    sublayer=ExtraBlock(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        conv_lr=conv_learning_rate,
+                        conv_decay=conv_decay,
+                        norm_decay=norm_decay,
+                        norm_type=norm_type,
+                        name="conv7_" + str(i + 1)))
+                self.extra_blocks.append(conv_extra)
+                self._update_out_channels(
+                    block_filter[1],
+                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        outs = []
+        y = self.conv1(inputs['image'])
+        for i, block in enumerate(self.dwsl):
+            y = block(y)
+            if i + 1 in self.feature_maps:
+                outs.append(y)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        y = outs[-1]
+        for i, block in enumerate(self.extra_blocks):
+            idx = i + len(self.dwsl)
+            y = block(y)
+            if idx + 1 in self.feature_maps:
+                outs.append(y)
+        return outs
+
+    @property
+    def out_shape(self):
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
@@ -0,0 +1,478 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+
+__all__ = ['MobileNetV3']
+
+
+def make_divisible(v, divisor=8, min_value=None):
+    if min_value is None:
+        min_value = divisor
+    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
+    if new_v < 0.9 * v:
+        new_v += divisor
+    return new_v
+
+
+class ConvBNLayer(nn.Layer):
+    def __init__(self,
+                 in_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 padding,
+                 num_groups=1,
+                 act=None,
+                 lr_mult=1.,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=""):
+        super(ConvBNLayer, self).__init__()
+        self.act = act
+        self.conv = nn.Conv2D(
+            in_channels=in_c,
+            out_channels=out_c,
+            kernel_size=filter_size,
+            stride=stride,
+            padding=padding,
+            groups=num_groups,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=False)
+
+        norm_lr = 0. if freeze_norm else lr_mult
+        param_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+        bias_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=False if freeze_norm else True)
+        global_stats = True if freeze_norm else None
+        if norm_type in ['sync_bn', 'bn']:
+            self.bn = nn.BatchNorm2D(
+                out_c,
+                weight_attr=param_attr,
+                bias_attr=bias_attr,
+                use_global_stats=global_stats)
+        norm_params = self.bn.parameters()
+        if freeze_norm:
+            for param in norm_params:
+                param.stop_gradient = True
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.bn(x)
+        if self.act is not None:
+            if self.act == "relu":
+                x = F.relu(x)
+            elif self.act == "relu6":
+                x = F.relu6(x)
+            elif self.act == "hard_swish":
+                x = F.hardswish(x)
+            else:
+                raise NotImplementedError(
+                    "The activation function is selected incorrectly.")
+        return x
+
+
+class ResidualUnit(nn.Layer):
+    def __init__(self,
+                 in_c,
+                 mid_c,
+                 out_c,
+                 filter_size,
+                 stride,
+                 use_se,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 act=None,
+                 return_list=False,
+                 name=''):
+        super(ResidualUnit, self).__init__()
+        self.if_shortcut = stride == 1 and in_c == out_c
+        self.use_se = use_se
+        self.return_list = return_list
+
+        self.expand_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=mid_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=act,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_expand")
+        self.bottleneck_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=mid_c,
+            filter_size=filter_size,
+            stride=stride,
+            padding=int((filter_size - 1) // 2),
+            num_groups=mid_c,
+            act=act,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_depthwise")
+        if self.use_se:
+            self.mid_se = SEModule(
+                mid_c, lr_mult, conv_decay, name=name + "_se")
+        self.linear_conv = ConvBNLayer(
+            in_c=mid_c,
+            out_c=out_c,
+            filter_size=1,
+            stride=1,
+            padding=0,
+            act=None,
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_linear")
+
+    def forward(self, inputs):
+        y = self.expand_conv(inputs)
+        x = self.bottleneck_conv(y)
+        if self.use_se:
+            x = self.mid_se(x)
+        x = self.linear_conv(x)
+        if self.if_shortcut:
+            x = paddle.add(inputs, x)
+        if self.return_list:
+            return [y, x]
+        else:
+            return x
+
+
+class SEModule(nn.Layer):
+    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
+        super(SEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2D(1)
+        mid_channels = int(channel // reduction)
+        self.conv1 = nn.Conv2D(
+            in_channels=channel,
+            out_channels=mid_channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
+        self.conv2 = nn.Conv2D(
+            in_channels=mid_channels,
+            out_channels=channel,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            weight_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
+            bias_attr=ParamAttr(
+                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
+
+    def forward(self, inputs):
+        outputs = self.avg_pool(inputs)
+        outputs = self.conv1(outputs)
+        outputs = F.relu(outputs)
+        outputs = self.conv2(outputs)
+        outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
+        return paddle.multiply(x=inputs, y=outputs)
+
+
+class ExtraBlockDW(nn.Layer):
+    def __init__(self,
+                 in_c,
+                 ch_1,
+                 ch_2,
+                 stride,
+                 lr_mult,
+                 conv_decay=0.,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=False,
+                 name=None):
+        super(ExtraBlockDW, self).__init__()
+        self.pointwise_conv = ConvBNLayer(
+            in_c=in_c,
+            out_c=ch_1,
+            filter_size=1,
+            stride=1,
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra1")
+        self.depthwise_conv = ConvBNLayer(
+            in_c=ch_1,
+            out_c=ch_2,
+            filter_size=3,
+            stride=stride,
+            padding='SAME',
+            num_groups=int(ch_1),
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra2_dw")
+        self.normal_conv = ConvBNLayer(
+            in_c=ch_2,
+            out_c=ch_2,
+            filter_size=1,
+            stride=1,
+            padding='SAME',
+            act='relu6',
+            lr_mult=lr_mult,
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name=name + "_extra2_sep")
+
+    def forward(self, inputs):
+        x = self.pointwise_conv(inputs)
+        x = self.depthwise_conv(x)
+        x = self.normal_conv(x)
+        return x
+
+
+@register
+@serializable
+class MobileNetV3(nn.Layer):
+    """MobileNetV3 backbone built from ResidualUnit blocks.
+
+    Layers are numbered to match their sublayer names: ``conv1`` is the stem,
+    the first residual unit is ``conv2`` (index 2), and extra layers continue
+    the numbering after the last residual unit.
+
+    Args:
+        scale (float): width multiplier; scaled channel counts are passed
+            through ``make_divisible``.
+        model_name (str): "large" or "small"; any other value raises
+            NotImplementedError.
+        feature_maps (int|list[int]): indices of the layers whose outputs
+            are returned; an int is wrapped into a list.
+        with_extra_blocks (bool): if set, a 1x1 ConvBNLayer and one
+            ExtraBlockDW per entry of ``extra_block_filters`` are appended,
+            and residual units selected by ``feature_maps`` output their
+            ``expand_conv`` result instead of their final result.
+        extra_block_filters (list[list[int]]): ``[ch_1, ch_2]`` of each
+            ExtraBlockDW.
+        lr_mult_list (list[float]): learning-rate factors; ``conv1`` uses
+            entry 0 and residual unit ``i`` uses entry ``i // 3``, clamped
+            to the last entry.
+        conv_decay (float): forwarded to the sub-layers.
+        multiplier (float): when not 1.0, scales the output channels of the
+            third-to-last cfg row and the expansion/output channels of the
+            last two rows.
+        norm_type (str): forwarded to the sub-layers; 'sync_bn' combined
+            with ``freeze_norm`` raises ValueError.
+        norm_decay (float): forwarded to the sub-layers.
+        freeze_norm (bool): forwarded to the sub-layers.
+    """
+    __shared__ = ['norm_type']
+
+    def __init__(
+            self,
+            scale=1.0,
+            model_name="large",
+            feature_maps=[6, 12, 15],
+            with_extra_blocks=False,
+            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
+            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
+            conv_decay=0.0,
+            multiplier=1.0,
+            norm_type='bn',
+            norm_decay=0.0,
+            freeze_norm=False):
+        super(MobileNetV3, self).__init__()
+        if isinstance(feature_maps, Integral):
+            feature_maps = [feature_maps]
+        if norm_type == 'sync_bn' and freeze_norm:
+            raise ValueError(
+                "The norm_type should not be sync_bn when freeze_norm is True")
+        self.feature_maps = feature_maps
+        self.with_extra_blocks = with_extra_blocks
+        self.extra_block_filters = extra_block_filters
+
+        # Base (unscaled) output width of the stem convolution.
+        inplanes = 16
+        # Each cfg row is unpacked below as (k, exp, c, se, nl, s) and passed
+        # to ResidualUnit as filter_size, mid_c, out_c, use_se, act, stride.
+        if model_name == "large":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, False, "relu", 1],
+                [3, 64, 24, False, "relu", 2],
+                [3, 72, 24, False, "relu", 1],
+                [5, 72, 40, True, "relu", 2],  # RCNN output
+                [5, 120, 40, True, "relu", 1],
+                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
+                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
+                [3, 200, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 184, 80, False, "hard_swish", 1],
+                [3, 480, 112, True, "hard_swish", 1],
+                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 960, 160, True, "hard_swish", 1],
+                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        elif model_name == "small":
+            self.cfg = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16, 16, True, "relu", 2],
+                [3, 72, 24, False, "relu", 2],  # RCNN output
+                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
+                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 240, 40, True, "hard_swish", 1],
+                [5, 120, 48, True, "hard_swish", 1],
+                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
+                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
+                [5, 576, 96, True, "hard_swish", 1],
+                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
+            ]
+        else:
+            raise NotImplementedError(
+                "mode[{}_model] is not implemented!".format(model_name))
+
+        # Rescale the tail of the table: output width of the third-to-last
+        # row, expansion and output width of the last two rows.
+        if multiplier != 1.0:
+            self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
+            self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
+            self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
+            self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
+            self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
+
+        self.conv1 = ConvBNLayer(
+            in_c=3,
+            out_c=make_divisible(inplanes * scale),
+            filter_size=3,
+            stride=2,
+            padding=1,
+            num_groups=1,
+            act="hard_swish",
+            lr_mult=lr_mult_list[0],
+            conv_decay=conv_decay,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            name="conv1")
+
+        self._out_channels = []
+        self.block_list = []
+        # `i` counts the layers built after conv1; layer `i` is named
+        # "conv<i + 2>" and that number is what feature_maps refers to.
+        i = 0
+        inplanes = make_divisible(inplanes * scale)
+        for (k, exp, c, se, nl, s) in self.cfg:
+            # One learning-rate factor per group of three units, clamped to
+            # the last entry of lr_mult_list.
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
+            return_list = self.with_extra_blocks and i + 2 in self.feature_maps
+
+            block = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ResidualUnit(
+                    in_c=inplanes,
+                    mid_c=make_divisible(scale * exp),
+                    out_c=make_divisible(scale * c),
+                    filter_size=k,
+                    stride=s,
+                    use_se=se,
+                    act=nl,
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    return_list=return_list,
+                    name="conv" + str(i + 2)))
+            self.block_list.append(block)
+            inplanes = make_divisible(scale * c)
+            i += 1
+            # `i` was just incremented, so `i + 1` is this unit's layer index.
+            # With return_list the exposed feature is the expand_conv output,
+            # hence the expanded width is recorded instead of the unit's
+            # output width.
+            self._update_out_channels(
+                make_divisible(scale * exp)
+                if return_list else inplanes, i + 1, feature_maps)
+
+        if self.with_extra_blocks:
+            self.extra_block_list = []
+            extra_out_c = make_divisible(scale * self.cfg[-1][1])
+            # All extra layers share the factor selected here.
+            lr_idx = min(i // 3, len(lr_mult_list) - 1)
+            lr_mult = lr_mult_list[lr_idx]
+
+            # 1x1 convolution that widens the last residual unit's output to
+            # the scaled expansion width of the last cfg row.
+            conv_extra = self.add_sublayer(
+                "conv" + str(i + 2),
+                sublayer=ConvBNLayer(
+                    in_c=inplanes,
+                    out_c=extra_out_c,
+                    filter_size=1,
+                    stride=1,
+                    padding=0,
+                    num_groups=1,
+                    act="hard_swish",
+                    lr_mult=lr_mult,
+                    conv_decay=conv_decay,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    name="conv" + str(i + 2)))
+            self.extra_block_list.append(conv_extra)
+            i += 1
+            self._update_out_channels(extra_out_c, i + 1, feature_maps)
+
+            for j, block_filter in enumerate(self.extra_block_filters):
+                # Each extra block consumes the previous extra layer's output.
+                in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
+                                                                           1][1]
+                conv_extra = self.add_sublayer(
+                    "conv" + str(i + 2),
+                    sublayer=ExtraBlockDW(
+                        in_c,
+                        block_filter[0],
+                        block_filter[1],
+                        stride=2,
+                        lr_mult=lr_mult,
+                        conv_decay=conv_decay,
+                        norm_type=norm_type,
+                        norm_decay=norm_decay,
+                        freeze_norm=freeze_norm,
+                        name='conv' + str(i + 2)))
+                self.extra_block_list.append(conv_extra)
+                i += 1
+                self._update_out_channels(block_filter[1], i + 1, feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record ``channel`` if layer ``feature_idx`` is a selected output."""
+        if feature_idx in feature_maps:
+            self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Return the outputs of the layers listed in ``self.feature_maps``.
+
+        Args:
+            inputs (dict): must contain the input tensor under key 'image'.
+
+        Returns:
+            list: selected features in execution order.
+        """
+        x = self.conv1(inputs['image'])
+        outs = []
+        for idx, block in enumerate(self.block_list):
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                # A unit built with return_list yields [expand output, unit
+                # output]: expose the former, keep propagating the latter.
+                if isinstance(x, list):
+                    outs.append(x[0])
+                    x = x[1]
+                else:
+                    outs.append(x)
+
+        if not self.with_extra_blocks:
+            return outs
+
+        for i, block in enumerate(self.extra_block_list):
+            # Extra layers continue the numbering after the residual units.
+            idx = i + len(self.block_list)
+            x = block(x)
+            if idx + 2 in self.feature_maps:
+                outs.append(x)
+        return outs
+
+    @property
+    def out_shape(self):
+        """Return one ShapeSpec per recorded output channel count."""
+        return [ShapeSpec(channels=c) for c in self._out_channels]
--- a/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
@@ -0,0 +1,266 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
+Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Normal, Constant
+
+from ppdet.modeling.ops import get_act_fn
+from ppdet.modeling.layers import ConvNormLayer
+
+
+class MobileOneBlock(nn.Layer):
+    """Re-parameterizable depthwise + pointwise convolution block.
+
+    While the training-time branches exist, ``forward`` runs two stages:
+
+    1. the sum of ``conv_num`` depthwise ``ConvNormLayer`` branches, one
+       depthwise 1x1 ``ConvNormLayer`` and, when ``ch_in == ch_out`` and
+       ``stride == 1``, a ``BatchNorm2D`` identity branch, followed by the
+       activation;
+    2. the sum of ``conv_num`` pointwise (1x1) ``ConvNormLayer`` branches and,
+       under the same condition, a second ``BatchNorm2D`` identity branch,
+       followed by the activation.
+
+    ``convert_to_deploy`` folds every branch into two plain ``nn.Conv2D``
+    layers (``conv1`` and ``conv2``); once both exist ``forward`` uses them.
+
+    Args:
+        ch_in (int): input channels.
+        ch_out (int): output channels of the pointwise stage.
+        stride (int): stride of the depthwise stage.
+        kernel_size (int): kernel size of the depthwise branches.
+        conv_num (int): number of parallel branches in each stage.
+        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
+        initializer, skip_quant: forwarded unchanged to every
+            ``ConvNormLayer`` branch.
+        act: ``None``, a ``str`` or a ``dict`` is resolved with
+            ``get_act_fn``; any other value is used as the activation as is.
+    """
+
+    def __init__(
+            self,
+            ch_in,
+            ch_out,
+            stride,
+            kernel_size,
+            conv_num=1,
+            norm_type='bn',
+            norm_decay=0.,
+            norm_groups=32,
+            bias_on=False,
+            lr_scale=1.,
+            freeze_norm=False,
+            initializer=Normal(
+                mean=0., std=0.01),
+            skip_quant=False,
+            act='relu', ):
+        super(MobileOneBlock, self).__init__()
+
+        self.ch_in = ch_in
+        self.ch_out = ch_out
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = (kernel_size - 1) // 2
+        self.k = conv_num
+
+        self.depth_conv = nn.LayerList()
+        self.point_conv = nn.LayerList()
+        for _ in range(self.k):
+            # Depthwise branch: one group per input channel.
+            self.depth_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_in,
+                    kernel_size,
+                    stride=stride,
+                    groups=ch_in,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+            # Pointwise branch: dense 1x1 convolution changing the width.
+            self.point_conv.append(
+                ConvNormLayer(
+                    ch_in,
+                    ch_out,
+                    1,
+                    stride=1,
+                    groups=1,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    norm_groups=norm_groups,
+                    bias_on=bias_on,
+                    lr_scale=lr_scale,
+                    freeze_norm=freeze_norm,
+                    initializer=initializer,
+                    skip_quant=skip_quant))
+        # Depthwise 1x1 branch of the first stage.
+        self.rbr_1x1 = ConvNormLayer(
+            ch_in,
+            ch_in,
+            1,
+            stride=self.stride,
+            groups=ch_in,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            norm_groups=norm_groups,
+            bias_on=bias_on,
+            lr_scale=lr_scale,
+            freeze_norm=freeze_norm,
+            initializer=initializer,
+            skip_quant=skip_quant)
+        # Identity (BN-only) branches exist only when the block keeps both the
+        # channel count and the spatial size.
+        self.rbr_identity_st1 = nn.BatchNorm2D(
+            num_features=ch_in,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.rbr_identity_st2 = nn.BatchNorm2D(
+            num_features=ch_out,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(
+                0.0))) if ch_in == ch_out and self.stride == 1 else None
+        self.act = get_act_fn(act) if act is None or isinstance(act, (
+            str, dict)) else act
+
+    def forward(self, x):
+        """Apply the block, using the fused convolutions once they exist."""
+        if hasattr(self, "conv1") and hasattr(self, "conv2"):
+            y = self.act(self.conv2(self.act(self.conv1(x))))
+        else:
+            if self.rbr_identity_st1 is None:
+                id_out_st1 = 0
+            else:
+                id_out_st1 = self.rbr_identity_st1(x)
+
+            x1_1 = 0
+            for branch in self.depth_conv:
+                x1_1 += branch(x)
+
+            x1_2 = self.rbr_1x1(x)
+            x1 = self.act(x1_1 + x1_2 + id_out_st1)
+
+            if self.rbr_identity_st2 is None:
+                id_out_st2 = 0
+            else:
+                id_out_st2 = self.rbr_identity_st2(x1)
+
+            x2_1 = 0
+            for branch in self.point_conv:
+                x2_1 += branch(x1)
+            y = self.act(x2_1 + id_out_st2)
+
+        return y
+
+    def convert_to_deploy(self):
+        """Fuse the training branches into ``conv1``/``conv2`` and drop them.
+
+        Calling this again after the branches were removed is a no-op.
+        """
+        if not hasattr(self, 'depth_conv'):
+            # The training-time branches were already fused and deleted by an
+            # earlier call; fusing again would fail on the missing attributes.
+            return
+
+        if not hasattr(self, 'conv1'):
+            self.conv1 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_in,
+                kernel_size=self.kernel_size,
+                stride=self.stride,
+                padding=self.padding,
+                groups=self.ch_in,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+        if not hasattr(self, 'conv2'):
+            self.conv2 = nn.Conv2D(
+                in_channels=self.ch_in,
+                out_channels=self.ch_out,
+                kernel_size=1,
+                stride=1,
+                padding='SAME',
+                groups=1,
+                bias_attr=ParamAttr(
+                    initializer=Constant(value=0.), learning_rate=1.))
+
+        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
+        )
+        self.conv1.weight.set_value(conv1_kernel)
+        self.conv1.bias.set_value(conv1_bias)
+        self.conv2.weight.set_value(conv2_kernel)
+        self.conv2.bias.set_value(conv2_bias)
+        self.__delattr__('depth_conv')
+        self.__delattr__('point_conv')
+        self.__delattr__('rbr_1x1')
+        if hasattr(self, 'rbr_identity_st1'):
+            self.__delattr__('rbr_identity_st1')
+        if hasattr(self, 'rbr_identity_st2'):
+            self.__delattr__('rbr_identity_st2')
+
+    def get_equivalent_kernel_bias(self):
+        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)``.
+
+        Each value is the sum of the BN-folded kernels (or biases) of all
+        branches of the corresponding stage; a missing identity branch
+        contributes 0.
+        """
+        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
+        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
+        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st1, kernel_size=self.kernel_size)
+
+        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
+        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
+            self.rbr_identity_st2, kernel_size=1)
+
+        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
+            st1_kernel1x1) + st1_kernelid
+
+        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
+
+        conv2_kernel = st2_kernel1x1 + st2_kernelid
+        conv2_bias = st2_bias1x1 + st2_biasid
+
+        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
+
+    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
+        """Pad a 1x1 kernel by ``(self.kernel_size - 1) // 2`` on every side.
+
+        Despite the name the target size follows ``self.kernel_size``.
+        Returns 0 when ``kernel1x1`` is ``None``.
+        """
+        if kernel1x1 is None:
+            return 0
+        else:
+            padding_size = (self.kernel_size - 1) // 2
+            return nn.functional.pad(
+                kernel1x1,
+                [padding_size, padding_size, padding_size, padding_size])
+
+    @staticmethod
+    def _fold_bn(kernel, norm):
+        """Fold batch norm ``norm`` into ``kernel``; return (kernel, bias)."""
+        std = (norm._variance + norm._epsilon).sqrt()
+        scale = (norm.weight / std).reshape((-1, 1, 1, 1))
+        return kernel * scale, norm.bias - norm._mean * norm.weight / std
+
+    def _fuse_bn_tensor(self, branch, kernel_size=3):
+        """Return the BN-folded ``(kernel, bias)`` of one branch.
+
+        Args:
+            branch: ``None`` (yields ``(0, 0)``), an ``nn.LayerList`` of
+                conv+norm layers (their folded results are summed), a single
+                ``ConvNormLayer``, or an ``nn.BatchNorm2D`` identity branch.
+            kernel_size (int): size of the identity kernel synthesised for a
+                ``BatchNorm2D`` branch; unused for the other branch types.
+        """
+        if branch is None:
+            return 0, 0
+
+        if isinstance(branch, nn.LayerList):
+            fused_kernels = []
+            fused_bias = []
+            for block in branch:
+                kernel, bias = self._fold_bn(block.conv.weight, block.norm)
+                fused_kernels.append(kernel)
+                fused_bias.append(bias)
+
+            return sum(fused_kernels), sum(fused_bias)
+
+        if isinstance(branch, ConvNormLayer):
+            return self._fold_bn(branch.conv.weight, branch.norm)
+
+        assert isinstance(branch, nn.BatchNorm2D)
+        # Build a convolution kernel that acts as the identity: dense
+        # (ch_in x ch_in) for the 1x1 stage, one input channel per filter
+        # (depthwise layout) otherwise.
+        input_dim = self.ch_in if kernel_size == 1 else 1
+        kernel_value = paddle.zeros(
+            shape=[self.ch_in, input_dim, kernel_size, kernel_size],
+            dtype='float32')
+        if kernel_size > 1:
+            for i in range(self.ch_in):
+                kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
+                    kernel_size - 1) // 2] = 1
+        elif kernel_size == 1:
+            for i in range(self.ch_in):
+                kernel_value[i, i % input_dim, 0, 0] = 1
+        else:
+            raise ValueError("Invalid kernel size recieved!")
+        kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
+        return self._fold_bn(kernel, branch)
--- a/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
@@ -0,0 +1,69 @@
+class NameAdapter(object):
+    """Map backbone layer names onto the names used by pretrained weights.
+
+    The naming scheme is chosen from the wrapped model's ``_model_type``
+    attribute ('SEResNeXt' and 'ResNeXt' get special handling).
+    """
+
+    def __init__(self, model):
+        super(NameAdapter, self).__init__()
+        self.model = model
+
+    @property
+    def model_type(self):
+        """``model._model_type``, or '' when the model does not define it."""
+        return getattr(self.model, '_model_type', '')
+
+    @property
+    def variant(self):
+        """``model.variant``, or '' when the model does not define it."""
+        return getattr(self.model, 'variant', '')
+
+    def fix_conv_norm_name(self, name):
+        """Return the norm-layer name that belongs to conv layer ``name``."""
+        # the naming rule is same as pretrained weight
+        if self.model_type == 'SEResNeXt':
+            return name + "_bn"
+        if name == "conv1":
+            return "bn_" + name
+        # Replace the first three characters of the conv name with "bn".
+        return "bn" + name[3:]
+
+    def fix_shortcut_name(self, name):
+        """Return the shortcut name; only SEResNeXt rewrites it."""
+        return 'conv' + name + '_prj' if self.model_type == 'SEResNeXt' else name
+
+    def fix_bottleneck_name(self, name):
+        """Return ``(conv1, conv2, conv3, shortcut)`` names for a bottleneck."""
+        if self.model_type == 'SEResNeXt':
+            convs = tuple('conv' + name + tail
+                          for tail in ('_x1', '_x2', '_x3'))
+            return convs + (name, )
+        return tuple(name + tail
+                     for tail in ("_branch2a", "_branch2b", "_branch2c",
+                                  "_branch1"))
+
+    def fix_basicblock_name(self, name):
+        """Return ``(conv1, conv2, shortcut)`` names for a basic block."""
+        if self.model_type == 'SEResNeXt':
+            convs = tuple('conv' + name + tail for tail in ('_x1', '_x2'))
+            return convs + (name, )
+        return tuple(name + tail
+                     for tail in ("_branch2a", "_branch2b", "_branch1"))
+
+    def fix_layer_warp_name(self, stage_num, count, i):
+        """Return the name of block ``i`` (0-based) inside stage ``stage_num``.
+
+        Stage 4 with more than 10 blocks is named 'a', 'b1', 'b2', ...; every
+        other stage uses consecutive letters. SEResNeXt uses
+        '<stage_num + 2>_<i + 1>' instead.
+        """
+        stage = 'res' + str(stage_num)
+        numbered_suffix = count > 10 and stage_num == 4
+        if not numbered_suffix:
+            conv_name = stage + chr(ord("a") + i)
+        elif i == 0:
+            conv_name = stage + "a"
+        else:
+            conv_name = stage + "b" + str(i)
+        if self.model_type == 'SEResNeXt':
+            conv_name = str(stage_num + 2) + '_' + str(i + 1)
+        return conv_name
+
+    def fix_c1_stage_name(self):
+        """Return the stem conv name: 'res_conv1' for ResNeXt, else 'conv1'."""
+        if self.model_type == 'ResNeXt':
+            return "res_conv1"
+        return "conv1"
--- a/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/resnet.py
@@ -0,0 +1,611 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+#   
+# Licensed under the Apache License, Version 2.0 (the "License");   
+# you may not use this file except in compliance with the License.  
+# You may obtain a copy of the License at   
+#   
+#     http://www.apache.org/licenses/LICENSE-2.0    
+#   
+# Unless required by applicable law or agreed to in writing, software   
+# distributed under the License is distributed on an "AS IS" BASIS, 
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
+# See the License for the specific language governing permissions and   
+# limitations under the License.
+
+import math
+from numbers import Integral
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Uniform
+from paddle import ParamAttr
+from paddle.nn.initializer import Constant
+from paddle.vision.ops import DeformConv2D
+from .name_adapter import NameAdapter
+from ..shape_spec import ShapeSpec
+
+__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
+
+# Number of residual blocks in each of the four stages, keyed by network
+# depth. ResNet.__init__ reads ``ResNet_cfg[depth]`` and passes entry ``i`` as
+# the block count of stage ``i``.
+ResNet_cfg = {
+    18: [2, 2, 2, 2],
+    34: [3, 4, 6, 3],
+    50: [3, 4, 6, 3],
+    101: [3, 4, 23, 3],
+    152: [3, 8, 36, 3],
+}
+
+
+class ConvNormLayer(nn.Layer):
+    """Convolution (plain or deformable v2) + batch norm + optional activation.
+
+    Args:
+        ch_in: input channels of the convolution.
+        ch_out: output channels of the convolution and width of the norm.
+        filter_size: square kernel size; padding is ``(filter_size - 1) // 2``.
+        stride: convolution stride.
+        groups: convolution groups.
+        act: name of a function in ``paddle.nn.functional`` applied after the
+            norm; a falsy value disables the activation.
+        norm_type: must be 'bn' or 'sync_bn'.
+        norm_decay: L2 decay coefficient of the norm scale and bias.
+        freeze_norm: when true the norm uses global statistics, gets learning
+            rate 0 and its parameters are marked as not trainable.
+        lr: learning-rate multiplier of the convolution weight (and of the
+            norm parameters when the norm is not frozen).
+        dcn_v2: use ``DeformConv2D`` driven by a learned offset/mask branch.
+    """
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 filter_size,
+                 stride,
+                 groups=1,
+                 act=None,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 lr=1.0,
+                 dcn_v2=False):
+        super(ConvNormLayer, self).__init__()
+        assert norm_type in ['bn', 'sync_bn']
+        self.norm_type = norm_type
+        self.act = act
+        self.dcn_v2 = dcn_v2
+
+        if self.dcn_v2:
+            # Per kernel position: 2 offset channels (x, y) and 1 mask channel.
+            self.offset_channel = 2 * filter_size**2
+            self.mask_channel = filter_size**2
+
+            # Zero-initialised so the deformable conv starts with no offsets.
+            self.conv_offset = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=3 * filter_size**2,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                weight_attr=ParamAttr(initializer=Constant(0.)),
+                bias_attr=ParamAttr(initializer=Constant(0.)))
+            self.conv = DeformConv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                dilation=1,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+        else:
+            self.conv = nn.Conv2D(
+                in_channels=ch_in,
+                out_channels=ch_out,
+                kernel_size=filter_size,
+                stride=stride,
+                padding=(filter_size - 1) // 2,
+                groups=groups,
+                weight_attr=ParamAttr(learning_rate=lr),
+                bias_attr=False)
+
+        norm_trainable = not freeze_norm
+        norm_lr = 0. if freeze_norm else lr
+        scale_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=norm_trainable)
+        shift_attr = ParamAttr(
+            learning_rate=norm_lr,
+            regularizer=L2Decay(norm_decay),
+            trainable=norm_trainable)
+
+        use_global_stats = True if freeze_norm else None
+        if norm_type in ['sync_bn', 'bn']:
+            self.norm = nn.BatchNorm2D(
+                ch_out,
+                weight_attr=scale_attr,
+                bias_attr=shift_attr,
+                use_global_stats=use_global_stats)
+        norm_params = self.norm.parameters()
+
+        if freeze_norm:
+            for frozen in norm_params:
+                frozen.stop_gradient = True
+
+    def forward(self, inputs):
+        """Apply convolution, then norm, then the optional activation."""
+        if self.dcn_v2:
+            offset_mask = self.conv_offset(inputs)
+            offset, mask = paddle.split(
+                offset_mask,
+                num_or_sections=[self.offset_channel, self.mask_channel],
+                axis=1)
+            mask = F.sigmoid(mask)
+            out = self.conv(inputs, offset, mask=mask)
+        else:
+            out = self.conv(inputs)
+
+        if self.norm_type in ['bn', 'sync_bn']:
+            out = self.norm(out)
+        if self.act:
+            activation = getattr(F, self.act)
+            out = activation(out)
+        return out
+
+
+class SELayer(nn.Layer):
+    """Channel gating: global average pool -> linear -> ReLU -> linear ->
+    sigmoid, with the result multiplied onto the input.
+
+    Args:
+        ch: number of input (and output) channels.
+        reduction_ratio: the hidden width is ``ch // reduction_ratio``.
+    """
+
+    def __init__(self, ch, reduction_ratio=16):
+        super(SELayer, self).__init__()
+        self.pool = nn.AdaptiveAvgPool2D(1)
+
+        # Both linear layers draw weights from U(-1/sqrt(fan_in), 1/sqrt(fan_in)).
+        squeeze_bound = 1.0 / math.sqrt(ch)
+        hidden = ch // reduction_ratio
+        self.squeeze = nn.Linear(
+            ch,
+            hidden,
+            weight_attr=paddle.ParamAttr(
+                initializer=Uniform(-squeeze_bound, squeeze_bound)),
+            bias_attr=True)
+
+        extract_bound = 1.0 / math.sqrt(hidden)
+        self.extract = nn.Linear(
+            hidden,
+            ch,
+            weight_attr=paddle.ParamAttr(
+                initializer=Uniform(-extract_bound, extract_bound)),
+            bias_attr=True)
+
+    def forward(self, inputs):
+        """Return ``inputs`` scaled by the learned per-channel gate."""
+        pooled = paddle.squeeze(self.pool(inputs), axis=[2, 3])
+        hidden = F.relu(self.squeeze(pooled))
+        gate = F.sigmoid(self.extract(hidden))
+        # Restore the two squeezed axes so the gate broadcasts over them.
+        gate = paddle.unsqueeze(gate, axis=[2, 3])
+        return gate * inputs
+
+
+class BasicBlock(nn.Layer):
+    """Residual block made of two 3x3 ``ConvNormLayer``s.
+
+    Args:
+        ch_in: input channels.
+        ch_out: output channels.
+        stride: stride of the first convolution (and of the projection).
+        shortcut: when true the input is added back unchanged; otherwise it
+            passes through a 1x1 projection, preceded by 2x2 average pooling
+            for ``variant == 'd'`` with ``stride == 2``.
+        variant: only 'd' changes behaviour here (see ``shortcut``).
+        groups, base_width: must be 1 and 64.
+        lr, norm_type, norm_decay, freeze_norm: forwarded to every
+            ``ConvNormLayer``.
+        dcn_v2: use a deformable convolution for the second 3x3 conv.
+        std_senet: apply an ``SELayer`` to the residual branch.
+    """
+
+    expansion = 1
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BasicBlock, self).__init__()
+        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
+
+        # Settings shared by every conv+norm layer of the block.
+        common = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # Downsample with pooling, then project with a stride-1 conv.
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=ch_out,
+                        filter_size=1,
+                        stride=1,
+                        **common))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=ch_out,
+                    filter_size=1,
+                    stride=stride,
+                    **common)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=stride,
+            act='relu',
+            **common)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=ch_out,
+            ch_out=ch_out,
+            filter_size=3,
+            stride=1,
+            act=None,
+            dcn_v2=dcn_v2,
+            **common)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(ch_out)
+
+    def forward(self, inputs):
+        """Return ``relu(residual(inputs) + shortcut(inputs))``."""
+        residual = self.branch2b(self.branch2a(inputs))
+        if self.std_senet:
+            residual = self.se(residual)
+
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        return F.relu(paddle.add(x=residual, y=identity))
+
+
+class BottleNeck(nn.Layer):
+    """Residual block made of 1x1 -> 3x3 -> 1x1 ``ConvNormLayer``s.
+
+    The block outputs ``ch_out * expansion`` channels.
+
+    Args:
+        ch_in: input channels.
+        ch_out: base width; see ``groups``/``base_width`` for the inner width.
+        stride: block stride. Variant 'a' applies it in the first 1x1 conv,
+            every other variant in the 3x3 conv.
+        shortcut: when true the input is added back unchanged; otherwise it
+            passes through a 1x1 projection, preceded by 2x2 average pooling
+            for ``variant == 'd'`` with ``stride == 2``.
+        variant: see ``stride`` and ``shortcut``.
+        groups: groups of the 3x3 convolution.
+        base_width: the inner width is
+            ``int(ch_out * (base_width / 64.)) * groups``.
+        lr, norm_type, norm_decay, freeze_norm: forwarded to every
+            ``ConvNormLayer``.
+        dcn_v2: use a deformable convolution for the 3x3 conv.
+        std_senet: apply an ``SELayer`` to the residual branch.
+    """
+
+    expansion = 4
+
+    def __init__(self,
+                 ch_in,
+                 ch_out,
+                 stride,
+                 shortcut,
+                 variant='b',
+                 groups=1,
+                 base_width=4,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(BottleNeck, self).__init__()
+        if variant == 'a':
+            stride1, stride2 = stride, 1
+        else:
+            stride1, stride2 = 1, stride
+
+        # ResNeXt
+        width = int(ch_out * (base_width / 64.)) * groups
+        expanded = ch_out * self.expansion
+
+        # Settings shared by every conv+norm layer of the block.
+        common = dict(
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            lr=lr)
+
+        self.branch2a = ConvNormLayer(
+            ch_in=ch_in,
+            ch_out=width,
+            filter_size=1,
+            stride=stride1,
+            groups=1,
+            act='relu',
+            **common)
+
+        self.branch2b = ConvNormLayer(
+            ch_in=width,
+            ch_out=width,
+            filter_size=3,
+            stride=stride2,
+            groups=groups,
+            act='relu',
+            dcn_v2=dcn_v2,
+            **common)
+
+        self.branch2c = ConvNormLayer(
+            ch_in=width,
+            ch_out=expanded,
+            filter_size=1,
+            stride=1,
+            groups=1,
+            **common)
+
+        self.shortcut = shortcut
+        if not shortcut:
+            if variant == 'd' and stride == 2:
+                # Downsample with pooling, then project with a stride-1 conv.
+                self.short = nn.Sequential()
+                self.short.add_sublayer(
+                    'pool',
+                    nn.AvgPool2D(
+                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
+                self.short.add_sublayer(
+                    'conv',
+                    ConvNormLayer(
+                        ch_in=ch_in,
+                        ch_out=expanded,
+                        filter_size=1,
+                        stride=1,
+                        **common))
+            else:
+                self.short = ConvNormLayer(
+                    ch_in=ch_in,
+                    ch_out=expanded,
+                    filter_size=1,
+                    stride=stride,
+                    **common)
+
+        self.std_senet = std_senet
+        if self.std_senet:
+            self.se = SELayer(expanded)
+
+    def forward(self, inputs):
+        """Return ``relu(residual(inputs) + shortcut(inputs))``."""
+        residual = self.branch2c(self.branch2b(self.branch2a(inputs)))
+
+        if self.std_senet:
+            residual = self.se(residual)
+
+        identity = inputs if self.shortcut else self.short(inputs)
+
+        return F.relu(paddle.add(x=residual, y=identity))
+
+
+class Blocks(nn.Layer):
+    """One stage: ``count`` residual blocks applied one after another.
+
+    Args:
+        block: block class (called with keyword arguments and exposing an
+            ``expansion`` attribute).
+        ch_in: input channels of the first block; later blocks receive
+            ``ch_out * block.expansion``.
+        ch_out: ``ch_out`` passed to every block.
+        count: number of blocks.
+        name_adapter: object whose ``fix_layer_warp_name`` names each block.
+        stage_num: stage number; the first block uses stride 2 unless
+            ``stage_num == 2``.
+        variant, groups, base_width, lr, norm_type, norm_decay, freeze_norm,
+        dcn_v2, std_senet: forwarded unchanged to every block.
+    """
+
+    def __init__(self,
+                 block,
+                 ch_in,
+                 ch_out,
+                 count,
+                 name_adapter,
+                 stage_num,
+                 variant='b',
+                 groups=1,
+                 base_width=64,
+                 lr=1.0,
+                 norm_type='bn',
+                 norm_decay=0.,
+                 freeze_norm=True,
+                 dcn_v2=False,
+                 std_senet=False):
+        super(Blocks, self).__init__()
+
+        # Arguments that are identical for every block of the stage.
+        shared = dict(
+            variant=variant,
+            groups=groups,
+            base_width=base_width,
+            lr=lr,
+            norm_type=norm_type,
+            norm_decay=norm_decay,
+            freeze_norm=freeze_norm,
+            dcn_v2=dcn_v2,
+            std_senet=std_senet)
+
+        self.blocks = []
+        for index in range(count):
+            leading = index == 0
+            sublayer_name = name_adapter.fix_layer_warp_name(stage_num, count,
+                                                             index)
+            # Only the first block changes resolution/width, so only it needs
+            # a projection instead of an identity shortcut.
+            unit = block(
+                ch_in=ch_in,
+                ch_out=ch_out,
+                stride=2 if leading and stage_num != 2 else 1,
+                shortcut=not leading,
+                **shared)
+            self.blocks.append(self.add_sublayer(sublayer_name, unit))
+            if leading:
+                ch_in = ch_out * block.expansion
+
+    def forward(self, inputs):
+        """Feed ``inputs`` through every block in order."""
+        feat = inputs
+        for unit in self.blocks:
+            feat = unit(feat)
+        return feat
+
+
+@register
+@serializable
+class ResNet(nn.Layer):
+    __shared__ = ['norm_type']
+
+    def __init__(self,
+                 depth=50,
+                 ch_in=64,
+                 variant='b',
+                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
+                 groups=1,
+                 base_width=64,
+                 norm_type='bn',
+                 norm_decay=0,
+                 freeze_norm=True,
+                 freeze_at=0,
+                 return_idx=[0, 1, 2, 3],
+                 dcn_v2_stages=[-1],
+                 num_stages=4,
+                 std_senet=False,
+                 freeze_stem_only=False):
+        """
+        Residual Network, see https://arxiv.org/abs/1512.03385
+
+        Args:
+            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
+            ch_in (int): output channel of first stage, default 64
+            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
+            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
+                                 lower learning rate ratio is need for pretrained model
+                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
+            groups (int): group convolution cardinality
+            base_width (int): base width of each group convolution
+            norm_type (str): normalization type, 'bn' or 'sync_bn'
+            norm_decay (float): weight decay for normalization layer weights
+            freeze_norm (bool): freeze normalization layers
+            freeze_at (int): when >= 0 the stem is frozen and, unless
+                             freeze_stem_only is set, so are stages 0..freeze_at;
+                             a negative value freezes nothing
+            return_idx (list): index of the stages whose feature maps are returned
+            dcn_v2_stages (list): index of stages who select deformable conv v2
+            num_stages (int): total num of stages
+            std_senet (bool): whether use senet, default False.
+            freeze_stem_only (bool): when freeze_at >= 0, freeze only the stem
+                             (conv1) and leave the residual stages trainable.
+        """
+        super(ResNet, self).__init__()
+        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
+        assert num_stages >= 1 and num_stages <= 4
+        self.depth = depth
+        self.variant = variant
+        self.groups = groups
+        self.base_width = base_width
+        self.norm_type = norm_type
+        self.norm_decay = norm_decay
+        self.freeze_norm = freeze_norm
+        self.freeze_at = freeze_at
+        if isinstance(return_idx, Integral):
+            return_idx = [return_idx]
+        assert max(return_idx) < num_stages, \
+            'the maximum return index must smaller than num_stages, ' \
+            'but received maximum return index is {} and num_stages ' \
+            'is {}'.format(max(return_idx), num_stages)
+        # Copy so the instance never aliases the shared default list (the
+        # list defaults stay in the signature, which is part of the interface).
+        self.return_idx = list(return_idx)
+        self.num_stages = num_stages
+        assert len(lr_mult_list) == 4, \
+            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
+        if isinstance(dcn_v2_stages, Integral):
+            dcn_v2_stages = [dcn_v2_stages]
+        assert max(dcn_v2_stages) < num_stages
+        self.dcn_v2_stages = list(dcn_v2_stages)
+
+        block_nums = ResNet_cfg[depth]
+        na = NameAdapter(self)
+
+        # Stem: three 3x3 convs for variants 'c'/'d', one 7x7 conv otherwise.
+        # Each entry is [in_channels, out_channels, kernel, stride, name].
+        conv1_name = na.fix_c1_stage_name()
+        if variant in ['c', 'd']:
+            conv_def = [
+                [3, ch_in // 2, 3, 2, "conv1_1"],
+                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
+                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
+            ]
+        else:
+            conv_def = [[3, ch_in, 7, 2, conv1_name]]
+        self.conv1 = nn.Sequential()
+        for (c_in, c_out, k, s, _name) in conv_def:
+            self.conv1.add_sublayer(
+                _name,
+                ConvNormLayer(
+                    ch_in=c_in,
+                    ch_out=c_out,
+                    filter_size=k,
+                    stride=s,
+                    groups=1,
+                    act='relu',
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    lr=1.0))
+
+        self.ch_in = ch_in
+        ch_out_list = [64, 128, 256, 512]
+        block = BottleNeck if depth >= 50 else BasicBlock
+
+        self._out_channels = [block.expansion * v for v in ch_out_list]
+        self._out_strides = [4, 8, 16, 32]
+
+        self.res_layers = []
+        for i in range(num_stages):
+            lr_mult = lr_mult_list[i]
+            stage_num = i + 2
+            res_name = "res{}".format(stage_num)
+            res_layer = self.add_sublayer(
+                res_name,
+                Blocks(
+                    block,
+                    self.ch_in,
+                    ch_out_list[i],
+                    count=block_nums[i],
+                    name_adapter=na,
+                    stage_num=stage_num,
+                    variant=variant,
+                    groups=groups,
+                    base_width=base_width,
+                    lr=lr_mult,
+                    norm_type=norm_type,
+                    norm_decay=norm_decay,
+                    freeze_norm=freeze_norm,
+                    dcn_v2=(i in self.dcn_v2_stages),
+                    std_senet=std_senet))
+            self.res_layers.append(res_layer)
+            # The next stage consumes this stage's output width.
+            self.ch_in = self._out_channels[i]
+
+        if freeze_at >= 0:
+            self._freeze_parameters(self.conv1)
+            if not freeze_stem_only:
+                for i in range(min(freeze_at + 1, num_stages)):
+                    self._freeze_parameters(self.res_layers[i])
+
+    def _freeze_parameters(self, m):
+        """Set ``stop_gradient`` on every parameter of layer ``m``."""
+        for p in m.parameters():
+            p.stop_gradient = True
+
+    @property
+    def out_shape(self):
+        """Return a ``ShapeSpec`` (channels, stride) per index in ``return_idx``."""
+        return [
+            ShapeSpec(
+                channels=self._out_channels[i], stride=self._out_strides[i])
+            for i in self.return_idx
+        ]
+
+    def forward(self, inputs):
+        """Return the outputs of the stages listed in ``return_idx``.
+
+        Args:
+            inputs (dict): must contain the key ``'image'``.
+        """
+        x = inputs['image']
+        conv1 = self.conv1(x)
+        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
+        outs = []
+        for idx, stage in enumerate(self.res_layers):
+            x = stage(x)
+            if idx in self.return_idx:
+                outs.append(x)
+        return outs
+
+
+@register
+class Res5Head(nn.Layer):
+    """Head consisting of a single stage-5 ``Blocks`` group with 3 blocks.
+
+    Args:
+        depth (int): ``depth >= 50`` selects ``BottleNeck`` blocks (1024 input
+            channels, ``512 * 4`` output channels); a smaller depth selects
+            ``BasicBlock`` (256 input channels, 512 output channels).
+    """
+
+    def __init__(self, depth=50):
+        super(Res5Head, self).__init__()
+        feat_out = 512
+        feat_in = 256 if depth < 50 else 1024
+        adapter = NameAdapter(self)
+        block_cls = BottleNeck if depth >= 50 else BasicBlock
+        self.res5 = Blocks(
+            block_cls,
+            feat_in,
+            feat_out,
+            count=3,
+            name_adapter=adapter,
+            stage_num=5)
+        self.feat_out = feat_out if depth < 50 else feat_out * 4
+
+    @property
+    def out_shape(self):
+        """Return a single ``ShapeSpec`` with ``feat_out`` channels, stride 16."""
+        spec = ShapeSpec(channels=self.feat_out, stride=16)
+        return [spec]
+
+    def forward(self, roi_feat, stage=0):
+        """Apply the res5 blocks to ``roi_feat``; ``stage`` is not used."""
+        return self.res5(roi_feat)
--- a/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
@@ -0,0 +1,250 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+from paddle import ParamAttr
+import paddle.nn.functional as F
+from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
+from paddle.nn.initializer import KaimingNormal
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register, serializable
+from numbers import Integral
+from ..shape_spec import ShapeSpec
+from ppdet.modeling.ops import channel_shuffle
+
+__all__ = ['ShuffleNetV2']
+
+
+class ConvBNLayer(nn.Layer):
+    """Bias-free Conv2D followed by BatchNorm2D and an optional activation.
+
+    Args:
+        in_channels, out_channels, kernel_size, stride, padding, groups:
+            forwarded unchanged to `Conv2D`.
+        act (str | None): name of a function looked up on
+            `paddle.nn.functional` and applied after batch norm. The spelling
+            "hard_swish" is mapped to "hardswish". A falsy value disables the
+            activation. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 groups=1,
+                 act=None):
+        super(ConvBNLayer, self).__init__()
+        conv_weight = ParamAttr(initializer=KaimingNormal())
+        self._conv = Conv2D(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            weight_attr=conv_weight,
+            bias_attr=False)
+
+        # Batch-norm scale and shift both use an L2Decay(0.0) regularizer.
+        self._batch_norm = BatchNorm2D(
+            out_channels,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self.act = 'hardswish' if act == "hard_swish" else act
+
+    def forward(self, inputs):
+        """Return act(bn(conv(inputs))), skipping act when none is set."""
+        out = self._batch_norm(self._conv(inputs))
+        if not self.act:
+            return out
+        activation = getattr(F, self.act)
+        return activation(out)
+
+
+class InvertedResidual(nn.Layer):
+    """ShuffleNetV2 unit that transforms only half of the channels.
+
+    The input is split in two along axis 1. The first half is passed through
+    untouched; the second half goes through a 1x1 conv, a 3x3 depthwise conv
+    and another 1x1 conv. Both halves are concatenated and shuffled with
+    `channel_shuffle(out, 2)`.
+
+    Args:
+        in_channels (int): channels of the unit input (half feed the branch).
+        out_channels (int): channels of the unit output (half come from the
+            branch).
+        stride (int): stride of the depthwise conv.
+        act (str): activation name handed to the two 1x1 convs.
+    """
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidual, self).__init__()
+        branch_in = in_channels // 2
+        branch_out = out_channels // 2
+        self._conv_pw = ConvBNLayer(
+            in_channels=branch_in,
+            out_channels=branch_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # depthwise: one group per channel, no activation
+        self._conv_dw = ConvBNLayer(
+            in_channels=branch_out,
+            out_channels=branch_out,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=branch_out,
+            act=None)
+        self._conv_linear = ConvBNLayer(
+            in_channels=branch_out,
+            out_channels=branch_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        """Split, transform the second half, concatenate and shuffle."""
+        half = inputs.shape[1] // 2
+        passthrough, branch = paddle.split(
+            inputs, num_or_sections=[half, half], axis=1)
+        branch = self._conv_linear(self._conv_dw(self._conv_pw(branch)))
+        merged = paddle.concat([passthrough, branch], axis=1)
+        return channel_shuffle(merged, 2)
+
+
+class InvertedResidualDS(nn.Layer):
+    """ShuffleNetV2 unit in which both branches process the full input.
+
+    Branch 1: 3x3 depthwise conv (with `stride`) then 1x1 conv.
+    Branch 2: 1x1 conv, 3x3 depthwise conv (with `stride`), 1x1 conv.
+    Each branch emits `out_channels // 2` channels; the results are
+    concatenated along axis 1 and shuffled with `channel_shuffle(out, 2)`.
+
+    Args:
+        in_channels (int): channels of the unit input.
+        out_channels (int): channels of the unit output.
+        stride (int): stride of both depthwise convs.
+        act (str): activation name handed to the 1x1 convs.
+    """
+
+    def __init__(self, in_channels, out_channels, stride, act="relu"):
+        super(InvertedResidualDS, self).__init__()
+        half_out = out_channels // 2
+
+        # branch1
+        self._conv_dw_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=in_channels,
+            act=None)
+        self._conv_linear_1 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=half_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        # branch2
+        self._conv_pw_2 = ConvBNLayer(
+            in_channels=in_channels,
+            out_channels=half_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+        self._conv_dw_2 = ConvBNLayer(
+            in_channels=half_out,
+            out_channels=half_out,
+            kernel_size=3,
+            stride=stride,
+            padding=1,
+            groups=half_out,
+            act=None)
+        self._conv_linear_2 = ConvBNLayer(
+            in_channels=half_out,
+            out_channels=half_out,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            groups=1,
+            act=act)
+
+    def forward(self, inputs):
+        """Run both branches on `inputs`, concatenate and shuffle."""
+        first = self._conv_linear_1(self._conv_dw_1(inputs))
+        second = self._conv_pw_2(inputs)
+        second = self._conv_linear_2(self._conv_dw_2(second))
+        merged = paddle.concat([first, second], axis=1)
+        return channel_shuffle(merged, 2)
+
+
+@register
+@serializable
+class ShuffleNetV2(nn.Layer):
+    """ShuffleNetV2 backbone returning a selection of intermediate features.
+
+    Feature indices are counted as follows: the stem (conv1 + max pool) is
+    index 1 and every following unit adds one, so the 4 + 8 + 4 units carry
+    indices 2..17. The default `feature_maps` picks the last unit of each of
+    the three stages.
+
+    Args:
+        scale (float): width multiplier; one of 0.25, 0.33, 0.5, 1.0, 1.5,
+            2.0. Any other value raises NotImplementedError.
+        act (str): activation name passed to the conv layers.
+        feature_maps (int | list[int]): feature indices to return; a single
+            integer is wrapped into a list.
+    """
+
+    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
+        super(ShuffleNetV2, self).__init__()
+        self.scale = scale
+        self.feature_maps = ([feature_maps] if isinstance(
+            feature_maps, Integral) else feature_maps)
+        stage_repeats = [4, 8, 4]
+
+        # (scale, channel widths); only indices 1..4 are read below.
+        channel_table = (
+            (0.25, [-1, 24, 24, 48, 96, 512]),
+            (0.33, [-1, 24, 32, 64, 128, 512]),
+            (0.5, [-1, 24, 48, 96, 192, 1024]),
+            (1.0, [-1, 24, 116, 232, 464, 1024]),
+            (1.5, [-1, 24, 176, 352, 704, 1024]),
+            (2.0, [-1, 24, 244, 488, 976, 2048]), )
+        stage_out_channels = None
+        for known_scale, widths in channel_table:
+            if scale == known_scale:
+                stage_out_channels = widths
+                break
+        if stage_out_channels is None:
+            raise NotImplementedError("This scale size:[" + str(scale) +
+                                      "] is not implemented!")
+
+        self._out_channels = []
+        self._feature_idx = 0
+        # 1. stem: 3x3 stride-2 conv followed by a stride-2 max pool
+        self._conv1 = ConvBNLayer(
+            in_channels=3,
+            out_channels=stage_out_channels[1],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+            act=act)
+        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
+        self._feature_idx += 1
+
+        # 2. three stages; the first unit of each stage uses stride 2
+        self._block_list = []
+        for stage_id, num_repeat in enumerate(stage_repeats):
+            stage_no = stage_id + 2
+            stage_width = stage_out_channels[stage_no]
+            for i in range(num_repeat):
+                if i == 0:
+                    unit = InvertedResidualDS(
+                        in_channels=stage_out_channels[stage_no - 1],
+                        out_channels=stage_width,
+                        stride=2,
+                        act=act)
+                else:
+                    unit = InvertedResidual(
+                        in_channels=stage_width,
+                        out_channels=stage_width,
+                        stride=1,
+                        act=act)
+                block = self.add_sublayer(
+                    name=str(stage_no) + '_' + str(i + 1), sublayer=unit)
+                self._block_list.append(block)
+                self._feature_idx += 1
+                self._update_out_channels(stage_width, self._feature_idx,
+                                          self.feature_maps)
+
+    def _update_out_channels(self, channel, feature_idx, feature_maps):
+        """Record `channel` when `feature_idx` is one of the requested maps."""
+        if feature_idx not in feature_maps:
+            return
+        self._out_channels.append(channel)
+
+    def forward(self, inputs):
+        """Return the outputs of the units selected by `feature_maps`.
+
+        Args:
+            inputs (dict): must provide the key 'image'.
+        """
+        y = self._max_pool(self._conv1(inputs['image']))
+        selected = []
+        # the first unit has feature index 2 (the stem is index 1)
+        for feature_idx, unit in enumerate(self._block_list, start=2):
+            y = unit(y)
+            if feature_idx in self.feature_maps:
+                selected.append(y)
+
+        return selected
+
+    @property
+    def out_shape(self):
+        """One ShapeSpec (channels only) per recorded output width."""
+        specs = []
+        for channels in self._out_channels:
+            specs.append(ShapeSpec(channels=channels))
+        return specs
--- a/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
@@ -0,0 +1,752 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
+The copyright of microsoft/Swin-Transformer is as follows:
+MIT License [see LICENSE for details]
+"""
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+from .transformer_utils import DropPath, Identity
+from .transformer_utils import add_parameter, to_2tuple
+from .transformer_utils import ones_, zeros_, trunc_normal_
+
+__all__ = ['SwinTransformer']
+
+# Preset hyper-parameters keyed by architecture name. SwinTransformer.__init__
+# reads pretrain_img_size, embed_dim, depths, num_heads and window_size from
+# the selected entry, and uses its 'pretrained' URL when no `pretrained`
+# argument is given.
+MODEL_cfg = {
+    # The 22k-to-1k fine-tuned weights are the default pretrained checkpoint;
+    # override them through SwinTransformer.pretrained in the config.
+    'swin_T_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 6, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_S_224': dict(
+        pretrain_img_size=224,
+        embed_dim=96,
+        depths=[2, 2, 18, 2],
+        num_heads=[3, 6, 12, 24],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_224': dict(
+        pretrain_img_size=224,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_224': dict(
+        pretrain_img_size=224,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=7,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
+    ),
+    'swin_B_384': dict(
+        pretrain_img_size=384,
+        embed_dim=128,
+        depths=[2, 2, 18, 2],
+        num_heads=[4, 8, 16, 32],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+    'swin_L_384': dict(
+        pretrain_img_size=384,
+        embed_dim=192,
+        depths=[2, 2, 18, 2],
+        num_heads=[6, 12, 24, 48],
+        window_size=12,
+        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
+    ),
+}
+
+
+class Mlp(nn.Layer):
+    """Two Linear layers with an activation between them and shared dropout.
+
+    Args:
+        in_features (int): input width.
+        hidden_features (int | None): hidden width; a falsy value falls back
+            to `in_features`.
+        out_features (int | None): output width; a falsy value falls back to
+            `in_features`.
+        act_layer: callable that builds the activation layer. Default: nn.GELU
+        drop (float): dropout probability used after the activation and after
+            the second Linear layer. Default: 0.0
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Compute drop(fc2(drop(act(fc1(x)))))."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+def window_partition(x, window_size):
+    """
+    Cut a feature map into non-overlapping square windows.
+
+    Args:
+        x: (B, H, W, C)
+        window_size (int): window size
+    Returns:
+        windows: (num_windows*B, window_size, window_size, C)
+    """
+    _, height, width, channels = x.shape
+    grid_shape = [
+        -1, height // window_size, window_size, width // window_size,
+        window_size, channels
+    ]
+    tiled = x.reshape(grid_shape)
+    # put the two window-grid axes ahead of the two in-window axes
+    tiled = tiled.transpose([0, 1, 3, 2, 4, 5])
+    return tiled.reshape([-1, window_size, window_size, channels])
+
+
+def window_reverse(windows, window_size, H, W):
+    """
+    Reassemble windows into a feature map (inverse of window_partition).
+
+    The batch size is never computed explicitly: both reshapes use -1 for
+    the leading axis.
+
+    Args:
+        windows: (num_windows*B, window_size, window_size, C)
+        window_size (int): Window size
+        H (int): Height of image
+        W (int): Width of image
+    Returns:
+        x: (B, H, W, C)
+    """
+    C = windows.shape[-1]
+    x = windows.reshape(
+        [-1, H // window_size, W // window_size, window_size, window_size, C])
+    # interleave window-grid and in-window axes back into rows and columns
+    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
+    return x
+
+
+class WindowAttention(nn.Layer):
+    """ Window based multi-head self attention (W-MSA) module with relative position bias.
+    It supports both of shifted and non-shifted window.
+
+    Args:
+        dim (int): Number of input channels.
+        window_size (tuple[int]): The height and width of the window.
+        num_heads (int): Number of attention heads.
+        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
+        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
+        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
+    """
+
+    def __init__(self,
+                 dim,
+                 window_size,
+                 num_heads,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.):
+
+        super().__init__()
+        self.dim = dim
+        self.window_size = window_size  # Wh, Ww
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        # any falsy qk_scale (None or 0) selects the default head_dim ** -0.5
+        self.scale = qk_scale or head_dim**-0.5
+
+        # define a parameter table of relative position bias
+        self.relative_position_bias_table = add_parameter(
+            self,
+            paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
+                          num_heads)))  # 2*Wh-1 * 2*Ww-1, nH
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(self.window_size[0])
+        coords_w = paddle.arange(self.window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
+        # broadcasted difference between every pair of token coordinates
+        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
+        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
+        relative_coords = coords_flatten_1 - coords_flatten_2
+        relative_coords = relative_coords.transpose(
+            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += self.window_size[
+            0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += self.window_size[1] - 1
+        # scale the row offset so that (row, col) pairs map to distinct
+        # indices once summed
+        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
+        # assigned as a regular attribute (no explicit buffer registration here)
+        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+        trunc_normal_(self.relative_position_bias_table)
+        self.softmax = nn.Softmax(axis=-1)
+
+    def forward(self, x, mask=None):
+        """ Forward function.
+        Args:
+            x: input features with shape of (num_windows*B, N, C)
+            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
+        Returns:
+            Tensor reshaped to (-1, N, C) after the output projection and dropout.
+        """
+        # B_ is not used below; the reshapes use -1 for the leading axis
+        B_, N, C = x.shape
+        # after the transpose: (3, num_windows*B, num_heads, N, C // num_heads)
+        qkv = self.qkv(x).reshape(
+            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
+                [2, 0, 3, 1, 4])
+        q, k, v = qkv[0], qkv[1], qkv[2]
+
+        q = q * self.scale
+        # q times k with its last two axes swapped; the reshape in the mask
+        # branch below treats the result as (num_windows*B, num_heads, N, N)
+        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
+
+        # look up one bias row per (query token, key token) pair
+        index = self.relative_position_index.flatten()
+
+        relative_position_bias = paddle.index_select(
+            self.relative_position_bias_table, index)
+        relative_position_bias = relative_position_bias.reshape([
+            self.window_size[0] * self.window_size[1],
+            self.window_size[0] * self.window_size[1], -1
+        ])  # Wh*Ww,Wh*Ww,nH
+        relative_position_bias = relative_position_bias.transpose(
+            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww
+        # unsqueeze(0) adds a leading axis so the bias broadcasts over windows
+        attn = attn + relative_position_bias.unsqueeze(0)
+
+        if mask is not None:
+            nW = mask.shape[0]
+            # expose the window axis so the per-window mask broadcasts over
+            # the leading (batch) axis and the head axis, then fold it back
+            attn = attn.reshape([-1, nW, self.num_heads, N, N
+                                 ]) + mask.unsqueeze(1).unsqueeze(0)
+            attn = attn.reshape([-1, self.num_heads, N, N])
+            attn = self.softmax(attn)
+        else:
+            attn = self.softmax(attn)
+
+        attn = self.attn_drop(attn)
+
+        # x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
+        # (the line above is the form this was adapted from; -1 replaces B_)
+        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+
+
+class SwinTransformerBlock(nn.Layer):
+    """ Swin Transformer Block.
+
+    The spatial size of the token grid is not a forward argument: the caller
+    must set the attributes `H` and `W` on the block before calling it.
+
+    Args:
+        dim (int): Number of input channels.
+        num_heads (int): Number of attention heads.
+        window_size (int): Window size.
+        shift_size (int): Shift size for SW-MSA.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float, optional): Stochastic depth rate. Default: 0.0
+        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 window_size=7,
+                 shift_size=0,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 act_layer=nn.GELU,
+                 norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.num_heads = num_heads
+        self.window_size = window_size
+        self.shift_size = shift_size
+        self.mlp_ratio = mlp_ratio
+        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+        self.norm1 = norm_layer(dim)
+        self.attn = WindowAttention(
+            dim,
+            window_size=to_2tuple(self.window_size),
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop)
+
+        # DropPath is only instantiated for a positive rate
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=mlp_hidden_dim,
+                       act_layer=act_layer,
+                       drop=drop)
+
+        # token-grid height and width; must be assigned before forward()
+        self.H = None
+        self.W = None
+
+    def forward(self, x, mask_matrix):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C), where H and W are read
+                from `self.H` and `self.W`.
+            mask_matrix: Attention mask for cyclic shift; only used when
+                `shift_size > 0`.
+        Returns:
+            Tensor reshaped to (-1, H*W, C).
+        """
+        B, L, C = x.shape
+        H, W = self.H, self.W
+        assert L == H * W, "input feature has wrong size"
+
+        shortcut = x
+        x = self.norm1(x)
+        x = x.reshape([-1, H, W, C])
+
+        # pad feature maps to multiples of window size
+        pad_l = pad_t = 0
+        pad_r = (self.window_size - W % self.window_size) % self.window_size
+        pad_b = (self.window_size - H % self.window_size) % self.window_size
+        # NOTE(review): eight pad values are given for a 4-D input; presumably
+        # they are read as (before, after) pairs per axis starting at axis 0,
+        # so H is extended by pad_b and W by pad_r at the end - confirm
+        # against the paddle.nn.functional.pad documentation.
+        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
+                  data_format='NHWC')
+        _, Hp, Wp, _ = x.shape
+
+        # cyclic shift
+        if self.shift_size > 0:
+            shifted_x = paddle.roll(
+                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
+            attn_mask = mask_matrix
+        else:
+            shifted_x = x
+            attn_mask = None
+
+        # partition windows
+        x_windows = window_partition(
+            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
+        x_windows = x_windows.reshape(
+            [x_windows.shape[0], self.window_size * self.window_size,
+             C])  # nW*B, window_size*window_size, C
+
+        # W-MSA/SW-MSA
+        attn_windows = self.attn(
+            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
+
+        # merge windows
+        attn_windows = attn_windows.reshape(
+            [x_windows.shape[0], self.window_size, self.window_size, C])
+        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
+                                   Wp)  # B H' W' C
+
+        # reverse cyclic shift
+        if self.shift_size > 0:
+            x = paddle.roll(
+                shifted_x,
+                shifts=(self.shift_size, self.shift_size),
+                axis=(1, 2))
+        else:
+            x = shifted_x
+
+        # crop the padding added above
+        if pad_r > 0 or pad_b > 0:
+            x = x[:, :H, :W, :]
+
+        x = x.reshape([-1, H * W, C])
+
+        # FFN
+        # two residual connections: around attention, then around the MLP
+        x = shortcut + self.drop_path(x)
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchMerging(nn.Layer):
+    r""" Patch Merging Layer.
+
+    Concatenates the four tokens of every 2x2 neighbourhood along the channel
+    axis (C -> 4*C), normalizes, and projects to 2*C with a bias-free Linear.
+
+    Args:
+        dim (int): Number of input channels.
+        norm_layer (nn.Layer, optional): Normalization layer.  Default: nn.LayerNorm
+    """
+
+    def __init__(self, dim, norm_layer=nn.LayerNorm):
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
+        self.norm = norm_layer(4 * dim)
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        """
+        _, L, C = x.shape
+        assert L == H * W, "input feature has wrong size"
+
+        grid = x.reshape([-1, H, W, C])
+
+        # extend odd sides by one so that every 2x2 neighbourhood is complete
+        extra_h, extra_w = H % 2, W % 2
+        if extra_h == 1 or extra_w == 1:
+            # paddle F.pad default data_format is 'NCHW'
+            grid = F.pad(grid, [0, 0, 0, extra_h, 0, extra_w, 0, 0],
+                         data_format='NHWC')
+            H += extra_h
+            W += extra_w
+
+        # (row offset, column offset) of the four strided views, each
+        # B H/2 W/2 C, in concatenation order
+        offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
+        parts = [grid[:, r::2, c::2, :] for r, c in offsets]
+        merged = paddle.concat(parts, -1)  # B H/2 W/2 4*C
+        merged = merged.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C
+
+        return self.reduction(self.norm(merged))
+
+
+class BasicLayer(nn.Layer):
+    """ A basic Swin Transformer layer for one stage.
+
+    Blocks alternate between no shift (even positions) and a shift of
+    `window_size // 2` (odd positions).
+
+    Args:
+        dim (int): Number of input channels.
+        depth (int): Number of blocks.
+        num_heads (int): Number of attention heads.
+        window_size (int): Local window size.
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
+        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
+        drop (float, optional): Dropout rate. Default: 0.0
+        attn_drop (float, optional): Attention dropout rate. Default: 0.0
+        drop_path (float | np.ndarray | list[float] | tuple[float], optional):
+            Stochastic depth rate. A sequence is indexed per block (it needs
+            at least `depth` entries); a scalar is shared by all blocks.
+            Default: 0.0
+        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
+        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
+    """
+
+    def __init__(self,
+                 dim,
+                 depth,
+                 num_heads,
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 norm_layer=nn.LayerNorm,
+                 downsample=None):
+        super().__init__()
+        self.window_size = window_size
+        self.shift_size = window_size // 2
+        self.depth = depth
+
+        # A per-block schedule may arrive as an ndarray, a list or a tuple.
+        # Previously only ndarray was indexed, so a list/tuple was handed
+        # whole to every block.
+        per_block = isinstance(drop_path, (np.ndarray, list, tuple))
+
+        # build blocks
+        self.blocks = nn.LayerList([
+            SwinTransformerBlock(
+                dim=dim,
+                num_heads=num_heads,
+                window_size=window_size,
+                shift_size=0 if (i % 2 == 0) else window_size // 2,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop,
+                attn_drop=attn_drop,
+                drop_path=drop_path[i] if per_block else drop_path,
+                norm_layer=norm_layer) for i in range(depth)
+        ])
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
+        else:
+            self.downsample = None
+
+    def forward(self, x, H, W):
+        """ Forward function.
+        Args:
+            x: Input feature, tensor size (B, H*W, C).
+            H, W: Spatial resolution of the input feature.
+        Returns:
+            tuple: (x, H, W, x_down, Wh, Ww). The first three describe the
+                stage output before downsampling. With a downsample layer,
+                x_down is its output and Wh, Ww are (H + 1) // 2 and
+                (W + 1) // 2; without one, the first three are repeated.
+        """
+
+        # calculate attention mask for SW-MSA
+        # mask covers the grid rounded up to a multiple of the window size
+        Hp = int(np.ceil(H / self.window_size)) * self.window_size
+        Wp = int(np.ceil(W / self.window_size)) * self.window_size
+        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
+        h_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        w_slices = (slice(0, -self.window_size),
+                    slice(-self.window_size, -self.shift_size),
+                    slice(-self.shift_size, None))
+        # label each of the 3 x 3 regions with its own id
+        cnt = 0
+        for h in h_slices:
+            for w in w_slices:
+                img_mask[:, h, w, :] = cnt
+
+                cnt += 1
+
+        mask_windows = window_partition(
+            img_mask, self.window_size)  # nW, window_size, window_size, 1
+        mask_windows = mask_windows.reshape(
+            [-1, self.window_size * self.window_size])
+        # token pairs from different regions get -100.0, same-region pairs 0
+        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
+        huns = -100.0 * paddle.ones_like(attn_mask)
+        attn_mask = huns * (attn_mask != 0).astype("float32")
+
+        for blk in self.blocks:
+            # blocks read the grid size from attributes, not arguments
+            blk.H, blk.W = H, W
+            x = blk(x, attn_mask)
+        if self.downsample is not None:
+            x_down = self.downsample(x, H, W)
+            Wh, Ww = (H + 1) // 2, (W + 1) // 2
+            return x, H, W, x_down, Wh, Ww
+        else:
+            return x, H, W, x, H, W
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+
+    A Conv2D whose kernel size and stride both equal the patch size,
+    optionally followed by a normalization over the embedding axis.
+
+    Args:
+        patch_size (int): Patch token size. Default: 4.
+        in_chans (int): Number of input image channels. Default: 3.
+        embed_dim (int): Number of linear projection output channels. Default: 96.
+        norm_layer (nn.Layer, optional): Normalization layer. Default: None
+    """
+
+    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
+        super().__init__()
+        self.patch_size = to_2tuple(patch_size)
+
+        self.in_chans = in_chans
+        self.embed_dim = embed_dim
+
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size)
+        self.norm = None if norm_layer is None else norm_layer(embed_dim)
+
+    def forward(self, x):
+        """Pad `x` up to a multiple of the patch size, project, then normalize.
+
+        Args:
+            x: 4-D input whose last two axes are height and width.
+        Returns:
+            The projected map, normalized when a norm layer is configured.
+        """
+        # TODO # export dynamic shape
+        _, _, H, W = x.shape
+        patch_h = self.patch_size[0]
+        patch_w = self.patch_size[1]
+
+        # pad the width first, then the height
+        w_rem = W % patch_w
+        if w_rem != 0:
+            x = F.pad(x, [0, patch_w - w_rem, 0, 0])
+        h_rem = H % patch_h
+        if h_rem != 0:
+            x = F.pad(x, [0, 0, 0, patch_h - h_rem])
+
+        x = self.proj(x)
+        if self.norm is None:
+            return x
+
+        # the norm acts on the last axis: move channels there and back
+        _, _, Wh, Ww = x.shape
+        tokens = x.flatten(2).transpose([0, 2, 1])
+        tokens = self.norm(tokens)
+        return tokens.transpose([0, 2, 1]).reshape(
+            [-1, self.embed_dim, Wh, Ww])
+
+
+@register
+@serializable
+class SwinTransformer(nn.Layer):
+    """ Swin Transformer backbone
+    Args:
+        arch (str): Architecture of Swin Transformer; must be a key of MODEL_cfg. Default: 'swin_T_224'
+        pretrain_img_size (int | tuple(int)): Input image size. Default 224
+        patch_size (int | tuple(int)): Patch size. Default: 4
+        in_chans (int): Number of input image channels. Default: 3
+        embed_dim (int): Patch embedding dimension. Default: 96
+        depths (tuple(int)): Depth of each Swin Transformer layer.
+        num_heads (tuple(int)): Number of attention heads in different layers.
+        window_size (int): Window size. Default: 7
+        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
+        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
+        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
+        drop_rate (float): Dropout rate. Default: 0
+        attn_drop_rate (float): Attention dropout rate. Default: 0
+        drop_path_rate (float): Stochastic depth rate. Default: 0.2
+        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
+        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
+        patch_norm (bool): If True, add normalization after patch embedding. Default: True
+    """
+
+    def __init__(self,
+                 arch='swin_T_224',
+                 pretrain_img_size=224,
+                 patch_size=4,
+                 in_chans=3,
+                 embed_dim=96,
+                 depths=[2, 2, 6, 2],
+                 num_heads=[3, 6, 12, 24],
+                 window_size=7,
+                 mlp_ratio=4.,
+                 qkv_bias=True,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.2,
+                 norm_layer=nn.LayerNorm,
+                 ape=False,
+                 patch_norm=True,
+                 out_indices=(0, 1, 2, 3),
+                 frozen_stages=-1,
+                 pretrained=None):
+        super(SwinTransformer, self).__init__()
+        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
+
+        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
+        embed_dim = MODEL_cfg[arch]['embed_dim']
+        depths = MODEL_cfg[arch]['depths']
+        num_heads = MODEL_cfg[arch]['num_heads']
+        window_size = MODEL_cfg[arch]['window_size']
+        if pretrained is None:
+            pretrained = MODEL_cfg[arch]['pretrained']
+
+        self.num_layers = len(depths)
+        self.ape = ape
+        self.patch_norm = patch_norm
+        self.out_indices = out_indices
+        self.frozen_stages = frozen_stages
+
+        # split image into non-overlapping patches
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            norm_layer=norm_layer if self.patch_norm else None)
+
+        # absolute position embedding
+        if self.ape:
+            pretrain_img_size = to_2tuple(pretrain_img_size)
+            patch_size = to_2tuple(patch_size)
+            patches_resolution = [
+                pretrain_img_size[0] // patch_size[0],
+                pretrain_img_size[1] // patch_size[1]
+            ]
+
+            self.absolute_pos_embed = add_parameter(
+                self,
+                paddle.zeros((1, embed_dim, patches_resolution[0],
+                              patches_resolution[1])))
+            trunc_normal_(self.absolute_pos_embed)
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        # stochastic depth
+        dpr = np.linspace(0, drop_path_rate,
+                          sum(depths))  # stochastic depth decay rule
+
+        # build layers
+        self.layers = nn.LayerList()
+        for i_layer in range(self.num_layers):
+            layer = BasicLayer(
+                dim=int(embed_dim * 2**i_layer),
+                depth=depths[i_layer],
+                num_heads=num_heads[i_layer],
+                window_size=window_size,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
+                norm_layer=norm_layer,
+                downsample=PatchMerging
+                if (i_layer < self.num_layers - 1) else None)
+            self.layers.append(layer)
+
+        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
+        self.num_features = num_features
+
+        # add a norm layer for each output
+        for i_layer in out_indices:
+            layer = norm_layer(num_features[i_layer])
+            layer_name = f'norm{i_layer}'
+            self.add_sublayer(layer_name, layer)
+
+        self.apply(self._init_weights)
+        self._freeze_stages()
+        if pretrained:
+            if 'http' in pretrained:  #URL
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  #model in local path
+                path = pretrained
+            self.set_state_dict(paddle.load(path))
+
+    def _freeze_stages(self):
+        if self.frozen_stages >= 0:
+            self.patch_embed.eval()
+            for param in self.patch_embed.parameters():
+                param.stop_gradient = True
+
+        if self.frozen_stages >= 1 and self.ape:
+            self.absolute_pos_embed.stop_gradient = True
+
+        if self.frozen_stages >= 2:
+            self.pos_drop.eval()
+            for i in range(0, self.frozen_stages - 1):
+                m = self.layers[i]
+                m.eval()
+                for param in m.parameters():
+                    param.stop_gradient = True
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                zeros_(m.bias)
+        elif isinstance(m, nn.LayerNorm):
+            zeros_(m.bias)
+            ones_(m.weight)
+
+    def forward(self, x):
+        """Forward function."""
+        x = self.patch_embed(x['image'])
+        B, _, Wh, Ww = x.shape
+        if self.ape:
+            # interpolate the position embedding to the corresponding size
+            absolute_pos_embed = F.interpolate(
+                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
+            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
+        else:
+            x = x.flatten(2).transpose([0, 2, 1])
+        x = self.pos_drop(x)
+        outs = []
+        for i in range(self.num_layers):
+            layer = self.layers[i]
+            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
+            if i in self.out_indices:
+                norm_layer = getattr(self, f'norm{i}')
+                x_out = norm_layer(x_out)
+                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
+                    (0, 3, 1, 2))
+                outs.append(out)
+
+        return outs
+
+    @property
+    def out_shape(self):
+        out_strides = [4, 8, 16, 32]
+        return [
+            ShapeSpec(
+                channels=self.num_features[i], stride=out_strides[i])
+            for i in self.out_indices
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
@@ -0,0 +1,381 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import ReLU, Swish, GELU
+import math
+
+from ppdet.core.workspace import register
+from ..shape_spec import ShapeSpec
+
+__all__ = ['TransEncoder']
+
+
+class BertEmbeddings(nn.Layer):
+    """Sum of word, position and token-type embeddings, then LayerNorm and dropout.
+
+    Args:
+        word_size (int): Number of entries of the word embedding table
+            (index 0 is the padding index).
+        position_embeddings_size (int): Number of entries of the position
+            embedding table.
+        word_type_size (int): Number of entries of the token-type embedding
+            table.
+        hidden_size (int): Embedding dimension.
+        dropout_prob (float): Dropout probability applied to the output.
+    """
+
+    def __init__(self, word_size, position_embeddings_size, word_type_size,
+                 hidden_size, dropout_prob):
+        super(BertEmbeddings, self).__init__()
+        self.word_embeddings = nn.Embedding(
+            word_size, hidden_size, padding_idx=0)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x, token_type_ids=None, position_ids=None):
+        """Embed the ids in ``x``.
+
+        Args:
+            x (Tensor): Integer word ids; axis 1 is used as the sequence axis.
+            token_type_ids (Tensor, optional): Token-type ids shaped like
+                ``x``. Defaults to all zeros.
+            position_ids (Tensor, optional): Position ids shaped like ``x``.
+                Defaults to ``0 .. seq_len - 1`` for every row.
+
+        Returns:
+            Tensor: Normalised, dropout-regularised sum of the three embeddings.
+        """
+        seq_len = paddle.shape(x)[1]
+        if position_ids is None:
+            position_ids = paddle.arange(
+                seq_len, dtype="int64").unsqueeze(0).expand_as(x)
+        if token_type_ids is None:
+            # Embedding lookups need integer indices; paddle.zeros would
+            # otherwise create a float32 tensor.
+            token_type_ids = paddle.zeros(paddle.shape(x), dtype="int64")
+
+        word_embs = self.word_embeddings(x)
+        position_embs = self.position_embeddings(position_ids)
+        token_type_embs = self.token_type_embeddings(token_type_ids)
+
+        embs_cmb = word_embs + position_embs + token_type_embs
+        embs_out = self.layernorm(embs_cmb)
+        embs_out = self.dropout(embs_out)
+        return embs_out
+
+
+class BertSelfAttention(nn.Layer):
+    """Multi-head scaled dot-product self-attention.
+
+    Args:
+        hidden_size (int): Size of the last input dimension; must be a
+            multiple of ``num_attention_heads``.
+        num_attention_heads (int): Number of attention heads.
+        attention_probs_dropout_prob (float): Dropout probability applied to
+            the attention probabilities.
+        output_attentions (bool): If True, ``forward`` also returns the
+            attention probabilities. Default: False
+
+    Raises:
+        ValueError: If ``hidden_size`` is not a multiple of
+            ``num_attention_heads``.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 output_attentions=False):
+        super(BertSelfAttention, self).__init__()
+        if hidden_size % num_attention_heads != 0:
+            # The message uses "{}" placeholders, so it has to be filled with
+            # str.format; the "%" operator would raise TypeError here and hide
+            # the intended ValueError.
+            raise ValueError(
+                "The hidden_size must be a multiple of the number of attention "
+                "heads, but got {} % {} != 0".format(hidden_size,
+                                                     num_attention_heads))
+
+        self.num_attention_heads = num_attention_heads
+        # Exact because divisibility was checked above.
+        self.attention_head_size = hidden_size // num_attention_heads
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(hidden_size, self.all_head_size)
+        self.key = nn.Linear(hidden_size, self.all_head_size)
+        self.value = nn.Linear(hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(attention_probs_dropout_prob)
+        self.output_attentions = output_attentions
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Apply self-attention to ``x``.
+
+        Args:
+            x (Tensor): 3-D input whose last dimension is ``hidden_size``.
+            attention_mask (Tensor): Additive mask, added to the scaled
+                attention scores before the softmax.
+            head_mask (Tensor, optional): Multiplicative mask applied to the
+                attention probabilities after dropout.
+
+        Returns:
+            tuple: ``(context,)`` or, when ``output_attentions`` is set,
+                ``(context, attention_probabilities)``.
+        """
+        query = self.query(x)
+        key = self.key(x)
+        value = self.value(x)
+
+        # Split the last dimension into (heads, head_size).
+        query_dim1, query_dim2 = paddle.shape(query)[:-1]
+        new_shape = [
+            query_dim1, query_dim2, self.num_attention_heads,
+            self.attention_head_size
+        ]
+        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+        # The key is transposed one step further so that matmul(query, key)
+        # directly yields the score matrix.
+        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
+        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
+
+        attention = paddle.matmul(query,
+                                  key) / math.sqrt(self.attention_head_size)
+        attention = attention + attention_mask
+        attention_value = F.softmax(attention, axis=-1)
+        attention_value = self.dropout(attention_value)
+
+        if head_mask is not None:
+            attention_value = attention_value * head_mask
+
+        # Move the head axis back next to head_size and merge the two.
+        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
+                                                                        3))
+        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
+        new_context_shape = [
+            ctx_dim1,
+            ctx_dim2,
+            self.all_head_size,
+        ]
+        context = context.reshape(new_context_shape)
+
+        if self.output_attentions:
+            return (context, attention_value)
+        else:
+            return (context, )
+
+
+class BertAttention(nn.Layer):
+    """Self-attention followed by a projection, dropout and residual LayerNorm.
+
+    Args:
+        hidden_size (int): Feature dimension of the input and output.
+        num_attention_heads (int): Number of attention heads.
+        attention_probs_dropout_prob (float): Dropout probability inside the
+            self-attention.
+        fc_dropout_prob (float): Dropout probability after the projection.
+        output_attentions (bool): Forwarded to the self-attention; when set,
+            the attention probabilities are returned as well. Default: False
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 output_attentions=False):
+        super().__init__()
+        self.bert_selfattention = BertSelfAttention(
+            hidden_size,
+            num_attention_heads,
+            attention_probs_dropout_prob,
+            output_attentions)
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return ``(features,)`` or ``(features, attention_probabilities)``."""
+        attn_out = self.bert_selfattention(x, attention_mask, head_mask)
+        projected = self.dropout(self.fc(attn_out[0]))
+        # Residual connection around the attention sub-block.
+        normed = self.layernorm(projected + x)
+        if len(attn_out) != 2:
+            return (normed, )
+        return (normed, attn_out[1])
+
+
+class BertFeedForward(nn.Layer):
+    """Position-wise feed-forward block with residual LayerNorm.
+
+    Args:
+        hidden_size (int): Input and output feature dimension.
+        intermediate_size (int): Width of the hidden linear layer.
+        num_attention_heads (int): Unused; kept for signature compatibility.
+        attention_probs_dropout_prob (float): Unused; kept for signature
+            compatibility.
+        fc_dropout_prob (float): Dropout probability after the second linear
+            layer.
+        act_fn (str): Name of the activation, resolved in this module's
+            namespace (e.g. ``'ReLU'``, ``'GELU'``, ``'Swish'``, ``'gelu'``).
+            Default: 'ReLU'
+        output_attentions (bool): Unused; kept for signature compatibility.
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertFeedForward, self).__init__()
+        self.fc1 = nn.Linear(hidden_size, intermediate_size)
+        # NOTE(security): eval() executes arbitrary expressions. `act_fn` must
+        # only ever come from trusted configuration, never from user input.
+        activation = eval(act_fn)
+        # Names such as 'ReLU' resolve to a Layer *class*. Calling the class on
+        # a tensor would build a layer instead of applying the activation, so
+        # it has to be instantiated first. Plain callables (e.g. `gelu`) are
+        # used as they are.
+        if isinstance(activation, type) and issubclass(activation, nn.Layer):
+            activation = activation()
+        self.act_fn = activation
+        self.fc2 = nn.Linear(intermediate_size, hidden_size)
+        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+
+    def forward(self, x):
+        """Return ``layernorm(dropout(fc2(act(fc1(x)))) + x)``."""
+        features = self.fc1(x)
+        features = self.act_fn(features)
+        features = self.fc2(features)
+        features = self.dropout(features)
+        features = self.layernorm(features + x)
+        return features
+
+
+class BertLayer(nn.Layer):
+    """One transformer encoder layer: attention block plus feed-forward block.
+
+    Args:
+        hidden_size (int): Feature dimension.
+        intermediate_size (int): Hidden width of the feed-forward block.
+        num_attention_heads (int): Number of attention heads.
+        attention_probs_dropout_prob (float): Dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): Dropout probability after the linear
+            projections.
+        act_fn (str): Activation name for the feed-forward block.
+            Default: 'ReLU'
+        output_attentions (bool): If True, ``forward`` also returns the
+            attention probabilities. Default: False
+    """
+
+    def __init__(self,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False):
+        super(BertLayer, self).__init__()
+        # BertAttention expects fc_dropout_prob before output_attentions.
+        # Leaving it out shifted output_attentions into the dropout slot, which
+        # disabled the dropout and made the attention maps unavailable.
+        self.attention = BertAttention(hidden_size, num_attention_heads,
+                                       attention_probs_dropout_prob,
+                                       fc_dropout_prob, output_attentions)
+        self.feed_forward = BertFeedForward(
+            hidden_size, intermediate_size, num_attention_heads,
+            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+            output_attentions)
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Return ``(features,)`` or ``(features, attention_probabilities)``."""
+        attention_feats = self.attention(x, attention_mask, head_mask)
+        features = self.feed_forward(attention_feats[0])
+        if len(attention_feats) == 2:
+            return (features, attention_feats[1])
+        else:
+            return (features, )
+
+
+class BertEncoder(nn.Layer):
+    """Stack of ``num_hidden_layers`` BertLayer modules.
+
+    Args:
+        num_hidden_layers (int): Number of stacked layers.
+        hidden_size (int): Feature dimension.
+        intermediate_size (int): Hidden width of each feed-forward block.
+        num_attention_heads (int): Number of attention heads.
+        attention_probs_dropout_prob (float): Dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): Dropout probability after the linear
+            projections.
+        act_fn (str): Activation name for the feed-forward blocks.
+            Default: 'ReLU'
+        output_attentions (bool): If True, the attention probabilities of
+            every layer are returned. Default: False
+        output_hidden_feats (bool): If True, the input and the output of every
+            layer are returned. Default: False
+    """
+
+    def __init__(self,
+                 num_hidden_layers,
+                 hidden_size,
+                 intermediate_size,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(BertEncoder, self).__init__()
+        self.output_attentions = output_attentions
+        self.output_hidden_feats = output_hidden_feats
+        self.layers = nn.LayerList([
+            BertLayer(hidden_size, intermediate_size, num_attention_heads,
+                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
+                      output_attentions) for _ in range(num_hidden_layers)
+        ])
+
+    def forward(self, x, attention_mask, head_mask=None):
+        """Run all layers.
+
+        Args:
+            x (Tensor): Input features.
+            attention_mask (Tensor): Additive attention mask passed to every
+                layer.
+            head_mask (sequence, optional): One head mask (or None) per layer.
+
+        Returns:
+            tuple: ``(last_features,)``, followed by the tuple
+                ``(input, out_1, ..., out_N)`` when ``output_hidden_feats`` is
+                set, followed by the tuple of per-layer attention
+                probabilities when ``output_attentions`` is set.
+        """
+        # Starts with the encoder input; every layer output is appended below.
+        all_features = (x, )
+        all_attentions = ()
+
+        for i, layer in enumerate(self.layers):
+            mask = head_mask[i] if head_mask is not None else None
+            layer_out = layer(x, attention_mask, mask)
+            x = layer_out[0]
+
+            # Record the layer *output*. Recording before the update stored the
+            # input twice and dropped the last layer's output.
+            if self.output_hidden_feats:
+                all_features = all_features + (x, )
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_out[1], )
+
+        outputs = (x, )
+        if self.output_hidden_feats:
+            outputs += (all_features, )
+        if self.output_attentions:
+            outputs += (all_attentions, )
+        return outputs
+
+
+class BertPooler(nn.Layer):
+    """Project the first token of a sequence and squash it with tanh.
+
+    Args:
+        hidden_size (int): Feature dimension of the input and output.
+    """
+
+    def __init__(self, hidden_size):
+        super().__init__()
+        self.fc = nn.Linear(hidden_size, hidden_size)
+        self.act = nn.Tanh()
+
+    def forward(self, x):
+        """Return ``tanh(fc(x[:, 0]))``."""
+        head_token = x[:, 0]
+        return self.act(self.fc(head_token))
+
+
+class METROEncoder(nn.Layer):
+    """Transformer encoder operating on image features, with a residual head.
+
+    Args:
+        vocab_size (int): Size of the word embedding table of the (unused in
+            ``forward``) BertEmbeddings sublayer.
+        num_hidden_layers (int): Number of encoder layers.
+        features_dims (int): Last dimension of the input features.
+        position_embeddings_size (int): Number of position embeddings; the
+            sequence length must not exceed it.
+        hidden_size (int): Encoder feature dimension.
+        intermediate_size (int): Hidden width of the feed-forward blocks.
+        output_feature_dim (int): Last dimension of the prediction.
+        num_attention_heads (int): Number of attention heads.
+        attention_probs_dropout_prob (float): Dropout probability on the
+            attention probabilities.
+        fc_dropout_prob (float): Dropout probability after linear layers and
+            on the input embeddings.
+        act_fn (str): Activation name for the feed-forward blocks.
+            Default: 'ReLU'
+        output_attentions (bool): See ``forward``. Default: False
+        output_hidden_feats (bool): See ``forward``. Default: False
+        use_img_layernorm (bool): If True, apply LayerNorm to the input
+            embeddings before dropout. Default: False
+    """
+
+    def __init__(self,
+                 vocab_size,
+                 num_hidden_layers,
+                 features_dims,
+                 position_embeddings_size,
+                 hidden_size,
+                 intermediate_size,
+                 output_feature_dim,
+                 num_attention_heads,
+                 attention_probs_dropout_prob,
+                 fc_dropout_prob,
+                 act_fn='ReLU',
+                 output_attentions=False,
+                 output_hidden_feats=False,
+                 use_img_layernorm=False):
+        super(METROEncoder, self).__init__()
+        self.img_dims = features_dims
+        self.num_hidden_layers = num_hidden_layers
+        self.use_img_layernorm = use_img_layernorm
+        self.output_attentions = output_attentions
+        # forward() reads this flag; it was previously never stored, which
+        # raised AttributeError as soon as output_attentions was enabled.
+        self.output_hidden_feats = output_hidden_feats
+        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
+                                        hidden_size, fc_dropout_prob)
+        self.encoder = BertEncoder(
+            num_hidden_layers, hidden_size, intermediate_size,
+            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
+            act_fn, output_attentions, output_hidden_feats)
+        self.pooler = BertPooler(hidden_size)
+        self.position_embeddings = nn.Embedding(position_embeddings_size,
+                                                hidden_size)
+        self.img_embedding = nn.Linear(
+            features_dims, hidden_size, bias_attr=True)
+        self.dropout = nn.Dropout(fc_dropout_prob)
+        if self.use_img_layernorm:
+            # forward() applies self.layernorm when this option is on, so the
+            # layer has to exist. Epsilon matches the other LayerNorms here.
+            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
+        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
+        self.residual = nn.Linear(features_dims, output_feature_dim)
+
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        """ Initialize the weights.
+
+        Linear and Embedding weights are drawn from N(0, 0.02); LayerNorm is
+        reset to unit weight and zero bias; Linear biases are zeroed.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.set_value(
+                paddle.normal(
+                    mean=0.0, std=0.02, shape=module.weight.shape))
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+            module.weight.set_value(
+                paddle.full(
+                    shape=module.weight.shape, fill_value=1.0))
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
+
+    def forward(self, x):
+        """Encode image features.
+
+        Args:
+            x (Tensor): Input features; the first two axes are used as batch
+                and sequence, the last one must equal ``features_dims``.
+
+        Returns:
+            Tensor | tuple: The prediction ``cls_head(encoded) + residual(x)``.
+                When both ``output_attentions`` and ``output_hidden_feats``
+                are set, a tuple ``(prediction, hidden_feats, attentions)`` is
+                returned instead.
+        """
+        batchsize, seq_len = paddle.shape(x)[:2]
+        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
+        position_ids = paddle.arange(
+            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
+
+        # Every position is attended to: the mask is all ones, which becomes
+        # an all-zero additive mask below.
+        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
+        head_mask = [None] * self.num_hidden_layers
+
+        position_embs = self.position_embeddings(position_ids)
+        attention_mask = (1.0 - attention_mask) * -10000.0
+
+        img_features = self.img_embedding(x)
+
+        # We empirically observe that adding an additional learnable position embedding leads to more stable training
+        embeddings = position_embs + img_features
+        if self.use_img_layernorm:
+            embeddings = self.layernorm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        encoder_outputs = self.encoder(
+            embeddings, attention_mask, head_mask=head_mask)
+
+        pred_score = self.cls_head(encoder_outputs[0])
+        res_img_feats = self.residual(x)
+        pred_score = pred_score + res_img_feats
+
+        if self.output_attentions and self.output_hidden_feats:
+            return pred_score, encoder_outputs[1], encoder_outputs[-1]
+        else:
+            return pred_score
+
+
+def gelu(x):
+    """Gaussian Error Linear Unit, exact erf formulation.
+        https://arxiv.org/abs/1606.08415
+    """
+    half_x = x * 0.5
+    erf_term = 1.0 + paddle.erf(x / math.sqrt(2.0))
+    return half_x * erf_term
+
+
+@register
+class TransEncoder(nn.Layer):
+    """Chain of METROEncoder stages that progressively reduce the feature size.
+
+    Stage ``i`` maps ``input_feat_dim[i]`` features to
+    ``input_feat_dim[i + 1]`` features; the last stage outputs 3 features.
+
+    Args:
+        vocab_size (int): Forwarded to every METROEncoder. Default: 30522
+        num_hidden_layers (int): Encoder layers per stage. Default: 4
+        num_attention_heads (int): Attention heads per layer. Default: 4
+        position_embeddings_size (int): Number of position embeddings.
+            Default: 512
+        intermediate_size (int): Hidden width of the feed-forward blocks.
+            Default: 3072
+        input_feat_dim (list | tuple): Input feature dimension of every stage.
+            Default: [2048, 512, 128]
+        hidden_feat_dim (list | tuple): Hidden size of every stage; each entry
+            must be divisible by ``num_attention_heads``.
+            Default: [1024, 256, 128]
+        attention_probs_dropout_prob (float): Default: 0.1
+        fc_dropout_prob (float): Default: 0.1
+        act_fn (str): Activation name for the feed-forward blocks.
+            Default: 'gelu'
+        output_attentions (bool): Forwarded to every stage. Default: False
+        output_hidden_feats (bool): Forwarded to every stage. Default: False
+    """
+
+    def __init__(self,
+                 vocab_size=30522,
+                 num_hidden_layers=4,
+                 num_attention_heads=4,
+                 position_embeddings_size=512,
+                 intermediate_size=3072,
+                 input_feat_dim=[2048, 512, 128],
+                 hidden_feat_dim=[1024, 256, 128],
+                 attention_probs_dropout_prob=0.1,
+                 fc_dropout_prob=0.1,
+                 act_fn='gelu',
+                 output_attentions=False,
+                 output_hidden_feats=False):
+        super(TransEncoder, self).__init__()
+        # Work on list copies: this accepts tuples as well (tuple + list would
+        # raise TypeError) and never touches the shared default arguments.
+        input_feat_dim = list(input_feat_dim)
+        hidden_feat_dim = list(hidden_feat_dim)
+        output_feat_dim = input_feat_dim[1:] + [3]
+        trans_encoder = []
+        for i, output_feature_dim in enumerate(output_feat_dim):
+            features_dims = input_feat_dim[i]
+            hidden_size = hidden_feat_dim[i]
+
+            # init a transformer encoder and append it to a list
+            assert hidden_size % num_attention_heads == 0, (
+                "hidden_feat_dim[{}]={} is not divisible by "
+                "num_attention_heads={}".format(i, hidden_size,
+                                                num_attention_heads))
+            model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
+                                 position_embeddings_size, hidden_size,
+                                 intermediate_size, output_feature_dim,
+                                 num_attention_heads,
+                                 attention_probs_dropout_prob, fc_dropout_prob,
+                                 act_fn, output_attentions, output_hidden_feats)
+            trans_encoder.append(model)
+        self.trans_encoder = paddle.nn.Sequential(*trans_encoder)
+
+    def forward(self, x):
+        """Run ``x`` through all stages in order and return the final output."""
+        out = self.trans_encoder(x)
+        return out
--- a/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddle.nn.initializer import TruncatedNormal, Constant, Assign
+
+# Common initializations
+# Shared initializer instances; each is applied by calling it on a parameter,
+# e.g. ``zeros_(layer.bias)``.
+ones_ = Constant(value=1.)  # fill with 1.0
+zeros_ = Constant(value=0.)  # fill with 0.0
+trunc_normal_ = TruncatedNormal(std=.02)  # truncated normal, std 0.02
+
+
+# Common Layers
+def drop_path(x, drop_prob=0., training=False):
+    """
+        Stochastic depth: randomly drop whole samples of ``x`` (when applied in
+        the main path of residual blocks) and rescale the kept ones by
+        ``1 / (1 - drop_prob)``. Returns ``x`` untouched when not training or
+        when ``drop_prob`` is 0.
+        The name 'Drop Connect' used elsewhere is misleading, as that is a
+        different form of dropout from a separate paper.
+        See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
+    """
+    if not training or drop_prob == 0.:
+        return x
+    keep = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
+    # One random value per sample, broadcast over all remaining axes.
+    batch = paddle.shape(x)[0]
+    mask_shape = (batch, ) + (1, ) * (x.ndim - 1)
+    noise = keep + paddle.rand(mask_shape, dtype=x.dtype)
+    keep_mask = paddle.floor(noise)  # binarize
+    return x.divide(keep) * keep_mask
+
+
+class DropPath(nn.Layer):
+    """Layer form of ``drop_path``; active only while the layer is training.
+
+    Args:
+        drop_prob (float, optional): Probability of dropping a sample.
+    """
+
+    def __init__(self, drop_prob=None):
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, x):
+        is_training = self.training
+        return drop_path(x, drop_prob=self.drop_prob, training=is_training)
+
+
+class Identity(nn.Layer):
+    """Pass-through layer: ``forward`` returns its argument unchanged."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, input):
+        # No computation on purpose.
+        return input
+
+
+# common funcs
+
+
+def to_2tuple(x):
+    """Return ``x`` as is if it is a list or tuple, otherwise the pair ``(x, x)``."""
+    return x if isinstance(x, (list, tuple)) else (x, x)
+
+
+def add_parameter(layer, datas, name=None):
+    """Create a parameter on ``layer`` initialised with the values of ``datas``.
+
+    Args:
+        layer (nn.Layer): Layer that creates (and optionally registers) the
+            parameter.
+        datas (Tensor): Initial values; its shape becomes the parameter shape.
+        name (str, optional): If given, the parameter is also registered on
+            ``layer`` under this name.
+
+    Returns:
+        The newly created parameter.
+    """
+    param = layer.create_parameter(
+        shape=(datas.shape), default_initializer=Assign(datas))
+    if not name:
+        return param
+    layer.add_parameter(name, param)
+    return param
+
+
+def window_partition(x, window_size):
+    """
+    Partition into non-overlapping windows with padding if needed.
+    Args:
+        x (tensor): input tokens with [B, H, W, C].
+        window_size (int): window size.
+    Returns:
+        windows: windows after partition with [B * num_windows, window_size, window_size, C].
+        (Hp, Wp): padded height and width before partition
+        (num_h, num_w): number of windows along height and width
+    """
+    batch, height, width, channels = paddle.shape(x)
+
+    # Amount needed to reach the next multiple of window_size (0 if aligned).
+    extra_h = (window_size - height % window_size) % window_size
+    extra_w = (window_size - width % window_size) % window_size
+
+    # Padding is applied in channel-first layout, then the layout is restored.
+    channel_first = x.transpose([0, 3, 1, 2])
+    pad_spec = paddle.to_tensor(
+        [0, int(extra_w), 0, int(extra_h)], dtype='int32')
+    x = F.pad(channel_first, pad_spec).transpose([0, 2, 3, 1])
+    padded_h, padded_w = height + extra_h, width + extra_w
+
+    rows, cols = padded_h // window_size, padded_w // window_size
+
+    grid = x.reshape([batch, rows, window_size, cols, window_size, channels])
+    windows = grid.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [-1, window_size, window_size, channels])
+    return windows, (padded_h, padded_w), (rows, cols)
+
+
+def window_unpartition(x, pad_hw, num_hw, hw):
+    """
+    Window unpartition into original sequences and removing padding.
+    Args:
+        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
+        pad_hw (Tuple): padded height and width (Hp, Wp).
+        num_hw (Tuple): number of windows along height and width (num_h, num_w).
+        hw (Tuple): original height and width (H, W) before padding.
+    Returns:
+        x: unpartitioned sequences with [B, H, W, C].
+    """
+    padded_h, padded_w = pad_hw
+    rows, cols = num_hw
+    height, width = hw
+    total_windows, window_size, _, channels = paddle.shape(x)
+    # The leading axis holds batch * windows-per-image.
+    batch = total_windows // (rows * cols)
+    grid = x.reshape([batch, rows, cols, window_size, window_size, channels])
+    merged = grid.transpose([0, 1, 3, 2, 4, 5]).reshape(
+        [batch, padded_h, padded_w, channels])
+
+    # Crop the padding away.
+    return merged[:, :height, :width, :]
--- a/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
@@ -0,0 +1,652 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+from paddle.nn.initializer import Constant
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import zeros_, DropPath, Identity
+
+
+class Mlp(nn.Layer):
+    """Two-layer feed-forward network.
+
+    Applies Linear -> activation -> Dropout -> Linear -> Dropout. The same
+    Dropout layer is used at both positions.
+
+    Args:
+        in_features (int): width of the input.
+        hidden_features (int|None): width of the hidden layer; falls back to
+            ``in_features`` when falsy.
+        out_features (int|None): width of the output; falls back to
+            ``in_features`` when falsy.
+        act_layer (callable): zero-argument factory for the activation layer.
+        drop (float): dropout probability.
+    """
+
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer=nn.GELU,
+                 drop=0.):
+        super().__init__()
+        if not out_features:
+            out_features = in_features
+        if not hidden_features:
+            hidden_features = in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = act_layer()
+        self.fc2 = nn.Linear(hidden_features, out_features)
+        self.drop = nn.Dropout(drop)
+
+    def forward(self, x):
+        """Run the feed-forward network on ``x`` and return the result."""
+        hidden = self.drop(self.act(self.fc1(x)))
+        return self.drop(self.fc2(hidden))
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention with an optional relative position bias.
+
+    Args:
+        dim (int): token embedding width.
+        num_heads (int): number of attention heads.
+        qkv_bias (bool): when true, learnable biases are created for the
+            query and value projections; the key projection gets a zero bias.
+        qk_scale (float|None): scale applied to the attention logits; when
+            falsy, ``(dim // num_heads) ** -0.5`` is used.
+        attn_drop (float): dropout probability on the attention weights.
+        proj_drop (float): dropout probability after the output projection.
+        window_size (tuple|None): (Wh, Ww) token grid; when truthy, a
+            per-layer relative position bias table and index are created.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 window_size=None):
+        super().__init__()
+        self.num_heads = num_heads
+        per_head_dim = dim // num_heads
+        self.scale = qk_scale or per_head_dim**-0.5
+
+        # The fused projection has no bias of its own; the q/v biases are
+        # separate parameters that forward() assembles into one vector.
+        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
+
+        if not qkv_bias:
+            self.q_bias = None
+            self.v_bias = None
+        else:
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+
+        if not window_size:
+            self.window_size = None
+            self.relative_position_bias_table = None
+            self.relative_position_index = None
+        else:
+            self.window_size = window_size
+            # One entry per 2D offset, plus three extra entries for
+            # cls to token & token 2 cls & cls to cls.
+            self.num_relative_distance = (2 * window_size[0] - 1) * (
+                2 * window_size[1] - 1) + 3
+            self.relative_position_bias_table = self.create_parameter(
+                shape=(self.num_relative_distance, num_heads),
+                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
+            self.register_buffer(
+                "relative_position_index",
+                self._build_relative_position_index(
+                    window_size, self.num_relative_distance))
+
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+
+    @staticmethod
+    def _build_relative_position_index(window_size, num_relative_distance):
+        """Build the (Wh*Ww+1, Wh*Ww+1) lookup index into the bias table.
+
+        Row/column 0 address the three extra table entries; the remaining
+        entries hold the flattened pair-wise relative offset of two tokens
+        inside the window.
+        """
+        win_h, win_w = window_size[0], window_size[1]
+        mesh = paddle.meshgrid([paddle.arange(win_h), paddle.arange(win_w)])
+        flat_coords = paddle.flatten(paddle.stack(mesh), 1)  # 2, Wh*Ww
+        as_rows = paddle.unsqueeze(flat_coords, 2)
+        as_cols = paddle.unsqueeze(flat_coords, 1)
+        offsets = as_rows.clone() - as_cols.clone()  # 2, Wh*Ww, Wh*Ww
+        offsets = offsets.transpose((1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+
+        # Shift both offsets to start from 0, then fold them into one index.
+        offsets[:, :, 0] += win_h - 1
+        offsets[:, :, 1] += win_w - 1
+        offsets[:, :, 0] *= 2 * win_w - 1
+
+        index = paddle.zeros(
+            shape=(win_h * win_w + 1, ) * 2, dtype=offsets.dtype)
+        index[1:, 1:] = offsets.sum(-1)  # Wh*Ww, Wh*Ww
+        index[0, 0:] = num_relative_distance - 3
+        index[0:, 0] = num_relative_distance - 2
+        index[0, 0] = num_relative_distance - 1
+        return index
+
+    def forward(self, x, rel_pos_bias=None):
+        """Apply self-attention to ``x``.
+
+        Args:
+            x (tensor): tokens; axes 1 and 2 are read as token count and
+                channel width.
+            rel_pos_bias (tensor|None): extra bias added to the attention
+                logits, in addition to this layer's own table (if any).
+        Returns:
+            tensor: attended tokens reshaped to (-1, N, C).
+        """
+        dims = paddle.shape(x)
+        num_tokens, channels = dims[1], dims[2]
+
+        fused_bias = None
+        if self.q_bias is not None:
+            # Order follows the q, k, v split below; k gets a zero bias.
+            fused_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
+        qkv = F.linear(x, weight=self.qkv.weight, bias=fused_bias)
+
+        qkv = qkv.reshape((-1, num_tokens, 3, self.num_heads,
+                           channels // self.num_heads))
+        qkv = qkv.transpose((2, 0, 3, 1, 4))
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
+
+        if self.relative_position_bias_table is not None:
+            side = self.window_size[0] * self.window_size[1] + 1
+            flat_index = self.relative_position_index.reshape([-1])
+            own_bias = self.relative_position_bias_table[flat_index].reshape(
+                [side, side, -1])  # Wh*Ww,Wh*Ww,nH
+            own_bias = own_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
+            attn = attn + own_bias.unsqueeze(0)
+        if rel_pos_bias is not None:
+            attn = attn + rel_pos_bias
+
+        attn = self.attn_drop(nn.functional.softmax(attn, axis=-1))
+
+        merged = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape(
+            (-1, num_tokens, channels))
+        return self.proj_drop(self.proj(merged))
+
+
+class Block(nn.Layer):
+    """Transformer encoder block: normalised attention and MLP, each wrapped
+    in a residual connection with optional stochastic depth.
+
+    Args:
+        dim (int): token embedding width.
+        num_heads (int): number of attention heads.
+        mlp_ratio (float): MLP hidden width as a multiple of ``dim``.
+        qkv_bias (bool): forwarded to ``Attention``.
+        qk_scale (float|None): forwarded to ``Attention``.
+        drop (float): dropout used by the attention projection and the MLP.
+        attn_drop (float): dropout on the attention weights.
+        drop_path (float): stochastic-depth rate; 0 disables it.
+        window_size (tuple|None): forwarded to ``Attention``.
+        init_values (float|None): when not None, two learnable per-channel
+            scales (``gamma_1``, ``gamma_2``) are created with this initial
+            value and multiply the attention and MLP outputs.
+        act_layer (callable): activation factory forwarded to ``Mlp``.
+        norm_layer (str): expression naming the class of the second norm.
+        epsilon (float): epsilon of the second norm.
+    """
+
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 window_size=None,
+                 init_values=None,
+                 act_layer=nn.GELU,
+                 norm_layer='nn.LayerNorm',
+                 epsilon=1e-5):
+        super().__init__()
+        # The first norm is always LayerNorm with epsilon=1e-6; `norm_layer`
+        # and `epsilon` only configure the second norm below.
+        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            window_size=window_size)
+        # Stochastic depth is only instantiated for a positive rate.
+        if drop_path > 0.:
+            self.drop_path = DropPath(drop_path)
+        else:
+            self.drop_path = Identity()
+        # NOTE(review): `norm_layer` is a string resolved with eval(); it must
+        # only ever come from trusted configuration.
+        norm_cls = eval(norm_layer)
+        self.norm2 = norm_cls(dim, epsilon=epsilon)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop)
+        if init_values is None:
+            self.gamma_1, self.gamma_2 = None, None
+        else:
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+
+    def forward(self, x, rel_pos_bias=None):
+        """Apply the attention and MLP residual branches to ``x``."""
+        use_layer_scale = self.gamma_1 is not None
+
+        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
+        if use_layer_scale:
+            attn_out = self.gamma_1 * attn_out
+        x = x + self.drop_path(attn_out)
+
+        mlp_out = self.mlp(self.norm2(x))
+        if use_layer_scale:
+            mlp_out = self.gamma_2 * mlp_out
+        x = x + self.drop_path(mlp_out)
+        return x
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+
+    Projects an image into a grid of patch embeddings with one convolution
+    whose kernel size and stride both equal ``patch_size``.
+
+    Args:
+        img_size (list[int]): reference image size; only used to derive the
+            patch-count attributes, not to validate the input.
+        patch_size (int): side of a square patch.
+        in_chans (int): number of input channels.
+        embed_dim (int): number of output channels of the projection.
+    """
+
+    def __init__(self,
+                 img_size=[224, 224],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768):
+        super().__init__()
+        count_dim0 = img_size[0] // patch_size
+        count_dim1 = img_size[1] // patch_size
+
+        self.img_size = img_size
+        self.patch_size = patch_size
+        # `num_patches_w` is derived from img_size[0], `num_patches_h` from
+        # img_size[1]; `patch_shape` holds the same two counts in index order.
+        self.num_patches_w = count_dim0
+        self.num_patches_h = count_dim1
+        self.patch_shape = (count_dim0, count_dim1)
+        self.num_patches = count_dim0 * count_dim1
+
+        self.proj = nn.Conv2D(
+            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+    @property
+    def num_patches_in_h(self):
+        """Patch count along img_size[1]."""
+        return self.img_size[1] // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        """Patch count along img_size[0]."""
+        return self.img_size[0] // self.patch_size
+
+    def forward(self, x, mask=None):
+        """Return the patch projection of ``x``; ``mask`` is not used."""
+        # The values are unused; the unpacking only requires a 4-element shape.
+        _, _, _, _ = x.shape
+        return self.proj(x)
+
+
+class RelativePositionBias(nn.Layer):
+    """Learnable relative position bias for a fixed token grid plus one
+    class token.
+
+    Args:
+        window_size (tuple): (Wh, Ww) size of the token grid.
+        num_heads (int): number of attention heads; one bias per head.
+    """
+
+    def __init__(self, window_size, num_heads):
+        super().__init__()
+        self.window_size = window_size
+        # One entry per 2D offset, plus three extra entries for
+        # cls to token & token 2 cls & cls to cls.
+        self.num_relative_distance = (2 * window_size[0] - 1) * (
+            2 * window_size[1] - 1) + 3
+        # Fixed: the keyword is `default_initializer` (was `default_initialize`).
+        self.relative_position_bias_table = self.create_parameter(
+            shape=(self.num_relative_distance, num_heads),
+            default_initializer=zeros_)
+
+        # get pair-wise relative position index for each token inside the window
+        coords_h = paddle.arange(window_size[0])
+        coords_w = paddle.arange(window_size[1])
+        coords = paddle.stack(paddle.meshgrid(
+            [coords_h, coords_w]))  # 2, Wh, Ww
+        coords_flatten = coords.flatten(1)  # 2, Wh*Ww
+
+        relative_coords = (coords_flatten[:, :, None] -
+                           coords_flatten[:, None, :])  # 2, Wh*Ww, Wh*Ww
+        # Fixed: the method is `transpose` (was the misspelled `transpos`).
+        relative_coords = relative_coords.transpose(
+            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
+        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+        relative_coords[:, :, 1] += window_size[1] - 1
+        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+        # Fixed: paddle.zeros takes `shape=` (was `size=`).
+        relative_position_index = paddle.zeros(
+            shape=(window_size[0] * window_size[1] + 1, ) * 2,
+            dtype=relative_coords.dtype)
+        relative_position_index[1:, 1:] = relative_coords.sum(
+            -1)  # Wh*Ww, Wh*Ww
+        # Row 0 / column 0 address the three extra table entries.
+        relative_position_index[0, 0:] = self.num_relative_distance - 3
+        relative_position_index[0:, 0] = self.num_relative_distance - 2
+        relative_position_index[0, 0] = self.num_relative_distance - 1
+        self.register_buffer("relative_position_index", relative_position_index)
+
+    def forward(self):
+        """Return the bias gathered from the table.
+
+        Returns:
+            tensor: bias with shape (nH, Wh*Ww+1, Wh*Ww+1).
+        """
+        num_tokens = self.window_size[0] * self.window_size[1] + 1
+        flat_index = self.relative_position_index.reshape([-1])
+        relative_position_bias = self.relative_position_bias_table[
+            flat_index].reshape([num_tokens, num_tokens, -1])
+        return relative_position_bias.transpose((2, 0, 1))
+
+
+def get_sinusoid_encoding_table(n_position, d_hid, token=False):
+    """Build a fixed sinusoid position encoding table.
+
+    Entry ``[pos, j]`` is ``sin(pos / 10000 ** (2 * (j // 2) / d_hid))`` for
+    even ``j`` and the cosine of the same angle for odd ``j``.
+
+    Args:
+        n_position (int): number of positions (rows).
+        d_hid (int): encoding width (columns).
+        token (bool): when true, one all-zero row is appended after the
+            position rows.
+    Returns:
+        tensor: float32 table with shape
+            [1, n_position (+1 if token), d_hid].
+    """
+    positions = np.arange(n_position, dtype=np.float64)[:, None]
+    # Column pairs (2i, 2i+1) share one frequency.
+    exponents = 2 * (np.arange(d_hid) // 2) / d_hid
+    sinusoid_table = positions / np.power(10000, exponents)
+    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
+    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
+    if token:
+        # Fixed: NumPy's keyword is `axis`; `dim=0` raised TypeError.
+        sinusoid_table = np.concatenate(
+            [sinusoid_table, np.zeros([1, d_hid])], axis=0)
+
+    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
+
+
+@register
+@serializable
+class VisionTransformer(nn.Layer):
+    """Vision Transformer backbone that returns intermediate feature maps.
+
+    The input image is split into patches by ``PatchEmbed``, a class token is
+    prepended, an optional position embedding is added, and the tokens pass
+    through ``depth`` ``Block`` layers. After every block listed in
+    ``out_indices`` the patch tokens (class token dropped) are reshaped back
+    to a [B, C, Hp, Wp] map. With ``with_fpn`` enabled those maps are
+    rescaled by the ``fpn1``..``fpn4`` heads created in ``init_fpn``.
+
+    Args:
+        img_size (list[int]): reference input size handed to ``PatchEmbed``.
+            NOTE(review): ``build_2d_sincos_position_embedding`` unpacks the
+            derived ``patch_shape`` as (h, w), so this is presumably
+            [height, width] -- confirm against the configs.
+        patch_size (int): side of a square patch.
+        in_chans (int): number of input image channels.
+        embed_dim (int): token embedding width.
+        depth (int): number of transformer blocks.
+        num_heads (int): attention heads per block.
+        mlp_ratio (float): MLP hidden width as a multiple of ``embed_dim``.
+        qkv_bias (bool): forwarded to every ``Block``.
+        qk_scale (float|None): forwarded to every ``Block``.
+        drop_rate (float): dropout after the position embedding; also
+            forwarded to every ``Block`` as ``drop``.
+        attn_drop_rate (float): forwarded to every ``Block`` as ``attn_drop``.
+        drop_path_rate (float): last value of the per-block stochastic-depth
+            rates, which are spaced linearly starting at 0.
+        norm_layer (str): forwarded to every ``Block``.
+        init_values (float|None): forwarded to every ``Block``.
+        use_rel_pos_bias (bool): when true each block receives
+            ``window_size=patch_shape`` and builds its own bias table.
+        use_shared_rel_pos_bias (bool): when true one
+            ``RelativePositionBias`` is created and passed to all blocks.
+        epsilon (float): forwarded to every ``Block``.
+        final_norm (bool): stored on the instance; not read by this class.
+        pretrained (str|None): URL or local path loaded by ``init_weight``.
+        out_indices (list[int]): at most four block indices whose outputs
+            are collected.
+        use_abs_pos_emb (bool): create a learnable position embedding; takes
+            precedence over ``use_sincos_pos_emb``.
+        use_sincos_pos_emb (bool): create a fixed 2D sin-cos embedding.
+        with_fpn (bool): create and apply the ``fpn1``..``fpn4`` heads.
+        num_fpn_levels (int): number of heads used, taken from the end of
+            (fpn1, fpn2, fpn3, fpn4); at most 4.
+        use_checkpoint (bool): recompute blocks during training; also calls
+            ``paddle.seed(0)`` at construction time.
+        **args: accepted and ignored.
+    """
+
+    # NOTE(review): `img_size` and `out_indices` use list defaults that are
+    # shared between calls; the code in this class only reads them.
+    def __init__(self,
+                 img_size=[672, 1092],
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 qk_scale=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 norm_layer='nn.LayerNorm',
+                 init_values=None,
+                 use_rel_pos_bias=False,
+                 use_shared_rel_pos_bias=False,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 out_indices=[3, 5, 7, 11],
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=True,
+                 with_fpn=True,
+                 num_fpn_levels=4,
+                 use_checkpoint=False,
+                 **args):
+        super().__init__()
+        self.img_size = img_size
+        self.embed_dim = embed_dim
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+        self.use_sincos_pos_emb = use_sincos_pos_emb
+        self.use_rel_pos_bias = use_rel_pos_bias
+        self.final_norm = final_norm
+        self.out_indices = out_indices
+        self.num_fpn_levels = num_fpn_levels
+
+        # Sets the global paddle seed as a side effect of construction.
+        if use_checkpoint:
+            paddle.seed(0)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+
+        # pos_w is derived from img_size[0] and pos_h from img_size[1]
+        # (see the PatchEmbed properties).
+        self.pos_w = self.patch_embed.num_patches_in_w
+        self.pos_h = self.patch_embed.num_patches_in_h
+
+        self.cls_token = self.create_parameter(
+            shape=(1, 1, embed_dim),
+            default_initializer=paddle.nn.initializer.Constant(value=0.))
+
+        if use_abs_pos_emb:
+            # Learnable embedding: one row per patch plus one for the cls token.
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
+
+            # The tensor assignment is immediately replaced by a parameter of
+            # the same shape, which is filled with the sin-cos values and
+            # excluded from gradient computation.
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+
+        else:
+            self.pos_embed = None
+
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        if use_shared_rel_pos_bias:
+            self.rel_pos_bias = RelativePositionBias(
+                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+        else:
+            self.rel_pos_bias = None
+
+        # Per-block stochastic-depth rates, linearly spaced over the depth.
+        dpr = np.linspace(0, drop_path_rate, depth)
+
+        self.blocks = nn.LayerList([
+            Block(
+                dim=embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                norm_layer=norm_layer,
+                init_values=init_values,
+                window_size=self.patch_embed.patch_shape
+                if use_rel_pos_bias else None,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        # Weights are loaded here, i.e. before the FPN heads below exist.
+        self.pretrained = pretrained
+        self.init_weight()
+
+        assert len(out_indices) <= 4, ''
+        self.out_indices = out_indices
+        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
+        # With FPN the strides are the last `num_fpn_levels` of (4, 8, 16, 32);
+        # without it every collected map keeps the patch stride.
+        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+
+        self.norm = Identity()
+
+        if self.with_fpn:
+            assert num_fpn_levels <= 4, ''
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size, )
+
+    def init_weight(self):
+        """Load ``self.pretrained`` (URL or local path) into this layer.
+
+        Does nothing when ``self.pretrained`` is falsy. When the checkpoint
+        holds a ``pos_embed`` whose shape differs from this model's, it is
+        resized with ``resize_pos_embed`` before loading.
+        """
+        pretrained = self.pretrained
+
+        if pretrained:
+            if 'http' in pretrained:  #URL
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:  #model in local path
+                path = pretrained
+
+            load_state_dict = paddle.load(path)
+            model_state_dict = self.state_dict()
+            pos_embed_name = "pos_embed"
+
+            if pos_embed_name in load_state_dict.keys():
+                load_pos_embed = paddle.to_tensor(
+                    load_state_dict[pos_embed_name], dtype="float32")
+                # NOTE(review): self.pos_embed is None when both position
+                # embedding flags are off; `.shape` would then fail here.
+                if self.pos_embed.shape != load_pos_embed.shape:
+                    # Treats the checkpoint grid as square (one row is the
+                    # class token).
+                    pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                    model_state_dict[pos_embed_name] = self.resize_pos_embed(
+                        load_pos_embed, (pos_size, pos_size),
+                        (self.pos_h, self.pos_w))
+
+                    # self.set_state_dict(model_state_dict)
+                    load_state_dict[pos_embed_name] = model_state_dict[
+                        pos_embed_name]
+
+                    print("Load pos_embed and resize it from {} to {} .".format(
+                        load_pos_embed.shape, self.pos_embed.shape))
+
+            self.set_state_dict(load_state_dict)
+            print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        """Create the four rescaling heads ``fpn1``..``fpn4`` and ``self.norm``.
+
+        Args:
+            embed_dim (int): channel count of the maps being rescaled.
+            patch_size (int): 16 or 8; selects the head layout.
+            out_with_norm (bool): use a LayerNorm instead of Identity for
+                ``self.norm``.
+        """
+        # NOTE(review): for any other patch_size no fpn heads are created,
+        # while forward() reads self.fpn1..self.fpn4 when with_fpn is set.
+        if patch_size == 16:
+            # fpn1: two stride-2 transposed convs; fpn2: one; fpn3: identity;
+            # fpn4: stride-2 max pooling.
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.BatchNorm2D(embed_dim),
+                nn.GELU(),
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn3 = Identity()
+
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            # fpn1: one stride-2 transposed conv; fpn2: identity;
+            # fpn3 / fpn4: stride-2 / stride-4 max pooling.
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = Identity()
+
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
+
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
+
+        if not out_with_norm:
+            self.norm = Identity()
+        else:
+            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
+
+    def interpolate_pos_encoding(self, x, w, h):
+        """Return ``self.pos_embed``, resized when the token grid differs.
+
+        NOTE(review): ``forward`` calls this as
+        ``interpolate_pos_encoding(x, h, w)``, so the parameter named ``w``
+        receives the image height and ``h`` the image width. This pairs with
+        ``num_patches_w`` being derived from ``img_size[0]`` in PatchEmbed;
+        the names look swapped but are used consistently -- confirm before
+        renaming anything.
+
+        Args:
+            x (tensor): tokens including the class token; only its shape
+                is read.
+            w (int): image extent matched against ``num_patches_w``.
+            h (int): image extent matched against ``num_patches_h``.
+        Returns:
+            tensor: position embedding with the class row first.
+        """
+        npatch = x.shape[1] - 1
+        N = self.pos_embed.shape[1] - 1
+        w0 = w // self.patch_embed.patch_size
+        h0 = h // self.patch_embed.patch_size
+        # Fast path: the input grid equals the grid the embedding was built for.
+        if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
+            return self.pos_embed
+        class_pos_embed = self.pos_embed[:, 0]
+        patch_pos_embed = self.pos_embed[:, 1:]
+        dim = x.shape[-1]
+        # we add a small number to avoid floating point error in the interpolation
+        # see discussion at https://github.com/facebookresearch/dino/issues/8
+        # w0, h0 = w0 + 0.1, h0 + 0.1
+        # patch_pos_embed = nn.functional.interpolate(
+        #     patch_pos_embed.reshape([
+        #         1, self.patch_embed.num_patches_w,
+        #         self.patch_embed.num_patches_h, dim
+        #     ]).transpose((0, 3, 1, 2)),
+        #     scale_factor=(w0 / self.patch_embed.num_patches_w,
+        #                   h0 / self.patch_embed.num_patches_h),
+        #     mode='bicubic', )
+
+        # Resize the patch rows as a channel-first map to the (w0, h0) grid.
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.reshape([
+                1, self.patch_embed.num_patches_w,
+                self.patch_embed.num_patches_h, dim
+            ]).transpose((0, 3, 1, 2)),
+            (w0, h0),
+            mode='bicubic', )
+
+        assert int(w0) == patch_pos_embed.shape[-2] and int(
+            h0) == patch_pos_embed.shape[-1]
+        patch_pos_embed = patch_pos_embed.transpose(
+            (0, 2, 3, 1)).reshape([1, -1, dim])
+        # Put the untouched class-token row back in front.
+        return paddle.concat(
+            (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Resize pos_embed weight.
+
+        The first row (class token) is kept as is; the remaining rows are
+        reshaped to a 2D grid, resized bicubically and flattened again.
+
+        Args:
+            pos_embed (Tensor): the pos_embed weight
+            old_hw (list[int]): the height and width of old pos_embed
+            new_hw (list[int]): the height and width of new pos_embed
+        Returns:
+            Tensor: the resized pos_embed weight
+        """
+        cls_pos_embed = pos_embed[:, :1, :]
+        pos_embed = pos_embed[:, 1:, :]
+
+        pos_embed = pos_embed.transpose([0, 2, 1])
+        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
+        pos_embed = F.interpolate(
+            pos_embed, new_hw, mode='bicubic', align_corners=False)
+        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
+        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
+
+        return pos_embed
+
+    def build_2d_sincos_position_embedding(
+            self,
+            embed_dim=768,
+            temperature=10000., ):
+        """Build a fixed 2D sin-cos position embedding for the patch grid.
+
+        Each quarter of the channels holds, in order, sin and cos of the
+        ``grid_w`` coordinate and sin and cos of the ``grid_h`` coordinate.
+        An all-zero row for the class token is placed first.
+
+        Args:
+            embed_dim (int): embedding width; must be divisible by 4.
+            temperature (float): base of the frequency schedule.
+        Returns:
+            Tensor: embedding with shape [1, h * w + 1, embed_dim].
+        """
+        h, w = self.patch_embed.patch_shape
+        grid_w = paddle.arange(w, dtype=paddle.float32)
+        grid_h = paddle.arange(h, dtype=paddle.float32)
+        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
+        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = embed_dim // 4
+        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = 1. / (temperature**omega)
+
+        # Outer product of the flattened coordinates with the frequencies.
+        out_w = grid_w.flatten()[..., None] @omega[None]
+        out_h = grid_h.flatten()[..., None] @omega[None]
+
+        pos_emb = paddle.concat(
+            [
+                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
+                paddle.cos(out_h)
+            ],
+            axis=1)[None, :, :]
+
+        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
+        pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
+        # pos_embed.stop_gradient = True
+
+        return pos_embed
+
+    def forward(self, x):
+        """Run the backbone.
+
+        Args:
+            x (Tensor|dict): image batch, or a dict holding it under
+                ``'image'``.
+        Returns:
+            list[Tensor]: one map per FPN head when ``with_fpn`` is set,
+            otherwise one [B, D, Hp, Wp] map per collected block.
+        """
+        x = x['image'] if isinstance(x, dict) else x
+        _, _, h, w = x.shape
+
+        x = self.patch_embed(x)
+
+        B, D, Hp, Wp = x.shape  # b * c * h * w
+
+        # Broadcast the single class token over the batch and prepend it.
+        cls_tokens = self.cls_token.expand(
+            (B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
+        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
+        x = paddle.concat([cls_tokens, x], axis=1)
+
+        if self.pos_embed is not None:
+            # x = x + self.interpolate_pos_encoding(x, w, h)
+            # (h, w) are passed into parameters named (w, h); see the note in
+            # interpolate_pos_encoding.
+            x = x + self.interpolate_pos_encoding(x, h, w)
+
+        x = self.pos_drop(x)
+
+        rel_pos_bias = self.rel_pos_bias(
+        ) if self.rel_pos_bias is not None else None
+
+        feats = []
+        for idx, blk in enumerate(self.blocks):
+            # Recompute is only used while training.
+            if self.use_checkpoint and self.training:
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, rel_pos_bias, **{"preserve_rng_state": True})
+            else:
+                x = blk(x, rel_pos_bias)
+
+            if idx in self.out_indices:
+                # Drop the class token and restore the [B, D, Hp, Wp] layout.
+                xp = paddle.reshape(
+                    paddle.transpose(
+                        self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
+                    shape=[B, D, Hp, Wp])
+                feats.append(xp)
+
+        if self.with_fpn:
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
+                -self.num_fpn_levels:]
+            assert len(fpns) == len(feats) or len(feats) == 1, ''
+            outputs = []
+            # With a single collected map, every head is applied to that map.
+            for i, m in enumerate(fpns):
+                outputs.append(
+                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))
+
+            return outputs
+
+        return feats
+
+    @property
+    def num_layers(self):
+        """Number of transformer blocks."""
+        return len(self.blocks)
+
+    @property
+    def no_weight_decay(self):
+        """Names of the parameters reported as exempt from weight decay."""
+        return {'pos_embed', 'cls_token'}
+
+    @property
+    def out_shape(self):
+        """One ``ShapeSpec`` per (out_channels, out_strides) pair."""
+        return [
+            ShapeSpec(
+                channels=c, stride=s)
+            for c, s in zip(self.out_channels, self.out_strides)
+        ]
--- a/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
+++ b/rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
@@ -0,0 +1,749 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import numpy as np
+import math
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+from paddle.nn.initializer import Constant, TruncatedNormal
+
+from ppdet.modeling.shape_spec import ShapeSpec
+from ppdet.core.workspace import register, serializable
+
+from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
+                                window_unpartition)
+from ..initializer import linear_init_
+
+# Names exported by a star-import of this module.
+__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
+
+
+class Mlp(nn.Layer):
+    def __init__(self,
+                 in_features,
+                 hidden_features=None,
+                 out_features=None,
+                 act_layer='nn.GELU',
+                 drop=0.,
+                 lr_factor=1.0):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(
+            in_features,
+            hidden_features,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.act = eval(act_layer)()
+        self.fc2 = nn.Linear(
+            hidden_features,
+            out_features,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.drop = nn.Dropout(drop)
+
+        self._init_weights()
+
+    def _init_weights(self):
+        linear_init_(self.fc1)
+        linear_init_(self.fc2)
+
+    def forward(self, x):
+        x = self.drop(self.act(self.fc1(x)))
+        x = self.drop(self.fc2(x))
+        return x
+
+
+class Attention(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads=8,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 attn_drop=0.,
+                 proj_drop=0.,
+                 use_rel_pos=False,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 qk_scale=None,
+                 lr_factor=1.0):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = qk_scale or self.head_dim**-0.5
+        self.use_rel_pos = use_rel_pos
+        self.input_size = input_size
+        self.rel_pos_zero_init = rel_pos_zero_init
+        self.window_size = window_size
+        self.lr_factor = lr_factor
+
+        self.qkv = nn.Linear(
+            dim,
+            dim * 3,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor)
+            if attn_bias else False)
+        if qkv_bias:
+            self.q_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+            self.v_bias = self.create_parameter(
+                shape=([dim]), default_initializer=zeros_)
+        else:
+            self.q_bias = None
+            self.v_bias = None
+        self.proj = nn.Linear(
+            dim,
+            dim,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+        self.attn_drop = nn.Dropout(attn_drop)
+        if window_size is None:
+            self.window_size = self.input_size[0]
+
+        self._init_weights()
+
+    def _init_weights(self):
+        linear_init_(self.qkv)
+        linear_init_(self.proj)
+
+        if self.use_rel_pos:
+            self.rel_pos_h = self.create_parameter(
+                [2 * self.window_size - 1, self.head_dim],
+                attr=ParamAttr(learning_rate=self.lr_factor),
+                default_initializer=Constant(value=0.))
+            self.rel_pos_w = self.create_parameter(
+                [2 * self.window_size - 1, self.head_dim],
+                attr=ParamAttr(learning_rate=self.lr_factor),
+                default_initializer=Constant(value=0.))
+
+            if not self.rel_pos_zero_init:
+                TruncatedNormal(self.rel_pos_h, std=0.02)
+                TruncatedNormal(self.rel_pos_w, std=0.02)
+
+    def get_rel_pos(self, seq_size, rel_pos):
+        max_rel_dist = int(2 * seq_size - 1)
+        # Interpolate rel pos if needed.
+        if rel_pos.shape[0] != max_rel_dist:
+            # Interpolate rel pos.
+            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
+            rel_pos = rel_pos.transpose([0, 2, 1])
+            rel_pos_resized = F.interpolate(
+                rel_pos,
+                size=(max_rel_dist, ),
+                mode="linear",
+                data_format='NCW')
+            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
+            rel_pos_resized = rel_pos_resized.transpose([1, 0])
+        else:
+            rel_pos_resized = rel_pos
+
+        coords = paddle.arange(seq_size, dtype='float32')
+        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
+        relative_coords += (seq_size - 1)
+        relative_coords = relative_coords.astype('int64').flatten()
+
+        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
+            [seq_size, seq_size, self.head_dim])
+
+    def add_decomposed_rel_pos(self, attn, q, h, w):
+        """
+        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
+        Args:
+            attn (Tensor): attention map.
+            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
+        Returns:
+            attn (Tensor): attention map with added relative positional embeddings.
+        """
+        Rh = self.get_rel_pos(h, self.rel_pos_h)
+        Rw = self.get_rel_pos(w, self.rel_pos_w)
+
+        B, _, dim = q.shape
+        r_q = q.reshape([B, h, w, dim])
+        # bhwc, hch->bhwh1
+        # bwhc, wcw->bhw1w
+        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
+        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)
+
+        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
+        return attn.reshape([B, h * w, h * w])
+
+    def forward(self, x):
+        B, H, W, C = paddle.shape(x)
+
+        if self.q_bias is not None:
+            qkv_bias = paddle.concat(
+                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
+            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
+        else:
+            qkv = self.qkv(x).reshape(
+                [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
+                    [2, 0, 3, 1, 4]).reshape(
+                        [3, B * self.num_heads, H * W, self.head_dim])
+
+        q, k, v = qkv[0], qkv[1], qkv[2]
+        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale
+
+        if self.use_rel_pos:
+            attn = self.add_decomposed_rel_pos(attn, q, H, W)
+
+        attn = F.softmax(attn, axis=-1)
+        attn = self.attn_drop(attn)
+        x = attn.matmul(v).reshape(
+            [B, self.num_heads, H * W, self.head_dim]).transpose(
+                [0, 2, 1, 3]).reshape([B, H, W, C])
+        x = self.proj(x)
+        return x
+
+
+class Block(nn.Layer):
+    def __init__(self,
+                 dim,
+                 num_heads,
+                 mlp_ratio=4.,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop=0.,
+                 attn_drop=0.,
+                 drop_path=0.,
+                 use_rel_pos=True,
+                 rel_pos_zero_init=True,
+                 window_size=None,
+                 input_size=None,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_factor=1.0,
+                 epsilon=1e-5):
+        super().__init__()
+        self.window_size = window_size
+
+        self.norm1 = eval(norm_layer)(dim,
+                                      weight_attr=ParamAttr(
+                                          learning_rate=lr_factor,
+                                          regularizer=L2Decay(0.0)),
+                                      bias_attr=ParamAttr(
+                                          learning_rate=lr_factor,
+                                          regularizer=L2Decay(0.0)),
+                                      epsilon=epsilon)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            attn_bias=attn_bias,
+            qk_scale=qk_scale,
+            attn_drop=attn_drop,
+            proj_drop=drop,
+            use_rel_pos=use_rel_pos,
+            rel_pos_zero_init=rel_pos_zero_init,
+            window_size=window_size,
+            input_size=input_size,
+            lr_factor=lr_factor)
+
+        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
+        self.norm2 = eval(norm_layer)(dim,
+                                      weight_attr=ParamAttr(
+                                          learning_rate=lr_factor,
+                                          regularizer=L2Decay(0.0)),
+                                      bias_attr=ParamAttr(
+                                          learning_rate=lr_factor,
+                                          regularizer=L2Decay(0.0)),
+                                      epsilon=epsilon)
+        self.mlp = Mlp(in_features=dim,
+                       hidden_features=int(dim * mlp_ratio),
+                       act_layer=act_layer,
+                       drop=drop,
+                       lr_factor=lr_factor)
+        if init_values is not None:
+            self.gamma_1 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+            self.gamma_2 = self.create_parameter(
+                shape=([dim]), default_initializer=Constant(value=init_values))
+        else:
+            self.gamma_1, self.gamma_2 = None, None
+
+    def forward(self, x):
+        y = self.norm1(x)
+        if self.window_size is not None:
+            y, pad_hw, num_hw = window_partition(y, self.window_size)
+        y = self.attn(y)
+        if self.gamma_1 is not None:
+            y = self.gamma_1 * y
+
+        if self.window_size is not None:
+            y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2]))
+        x = x + self.drop_path(y)
+        if self.gamma_2 is None:
+            x = x + self.drop_path(self.mlp(self.norm2(x)))
+        else:
+            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+
+        return x
+
+
+class PatchEmbed(nn.Layer):
+    """ Image to Patch Embedding
+    """
+
+    def __init__(self,
+                 img_size=(224, 224),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 lr_factor=0.01):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.proj = nn.Conv2D(
+            in_chans,
+            embed_dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            weight_attr=ParamAttr(learning_rate=lr_factor),
+            bias_attr=ParamAttr(learning_rate=lr_factor))
+
+    @property
+    def num_patches_in_h(self):
+        return self.img_size[1] // self.patch_size
+
+    @property
+    def num_patches_in_w(self):
+        return self.img_size[0] // self.patch_size
+
+    def forward(self, x):
+        out = self.proj(x)
+        return out
+
+
+@register
+@serializable
+class VisionTransformer2D(nn.Layer):
+    """ Vision Transformer with support for patch input
+    """
+
+    def __init__(self,
+                 img_size=(1024, 1024),
+                 patch_size=16,
+                 in_chans=3,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=False,
+                 attn_bias=False,
+                 qk_scale=None,
+                 init_values=None,
+                 drop_rate=0.,
+                 attn_drop_rate=0.,
+                 drop_path_rate=0.,
+                 act_layer='nn.GELU',
+                 norm_layer='nn.LayerNorm',
+                 lr_decay_rate=1.0,
+                 global_attn_indexes=(2, 5, 8, 11),
+                 use_abs_pos=False,
+                 use_rel_pos=False,
+                 use_abs_pos_emb=False,
+                 use_sincos_pos_emb=False,
+                 rel_pos_zero_init=True,
+                 epsilon=1e-5,
+                 final_norm=False,
+                 pretrained=None,
+                 window_size=None,
+                 out_indices=(11, ),
+                 with_fpn=False,
+                 use_checkpoint=False,
+                 *args,
+                 **kwargs):
+        super().__init__()
+        self.img_size = img_size
+        self.patch_size = patch_size
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.depth = depth
+        self.global_attn_indexes = global_attn_indexes
+        self.epsilon = epsilon
+        self.with_fpn = with_fpn
+        self.use_checkpoint = use_checkpoint
+
+        self.patch_h = img_size[0] // patch_size
+        self.patch_w = img_size[1] // patch_size
+        self.num_patches = self.patch_h * self.patch_w
+        self.use_abs_pos = use_abs_pos
+        self.use_abs_pos_emb = use_abs_pos_emb
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim)
+
+        dpr = np.linspace(0, drop_path_rate, depth)
+        if use_checkpoint:
+            paddle.seed(0)
+
+        if use_abs_pos_emb:
+            self.pos_w = self.patch_embed.num_patches_in_w
+            self.pos_h = self.patch_embed.num_patches_in_h
+            self.pos_embed = self.create_parameter(
+                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
+                default_initializer=paddle.nn.initializer.TruncatedNormal(
+                    std=.02))
+        elif use_sincos_pos_emb:
+            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
+                                                              self.patch_w)
+
+            self.pos_embed = pos_embed
+            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
+            self.pos_embed.set_value(pos_embed.numpy())
+            self.pos_embed.stop_gradient = True
+        else:
+            self.pos_embed = None
+
+        self.blocks = nn.LayerList([
+            Block(
+                embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                attn_bias=attn_bias,
+                qk_scale=qk_scale,
+                drop=drop_rate,
+                attn_drop=attn_drop_rate,
+                drop_path=dpr[i],
+                use_rel_pos=use_rel_pos,
+                rel_pos_zero_init=rel_pos_zero_init,
+                window_size=None
+                if i in self.global_attn_indexes else window_size,
+                input_size=[self.patch_h, self.patch_w],
+                act_layer=act_layer,
+                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
+                norm_layer=norm_layer,
+                init_values=init_values,
+                epsilon=epsilon) for i in range(depth)
+        ])
+
+        assert len(out_indices) <= 4, 'out_indices out of bound'
+        self.out_indices = out_indices
+        self.pretrained = pretrained
+        self.init_weight()
+
+        self.out_channels = [embed_dim for _ in range(len(out_indices))]
+        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
+            patch_size for _ in range(len(out_indices))
+        ]
+        self.norm = Identity()
+        if self.with_fpn:
+            self.init_fpn(
+                embed_dim=embed_dim,
+                patch_size=patch_size,
+                out_with_norm=final_norm)
+
+    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
+        return lr_decay_rate**(self.depth - layer_id)
+
+    def init_weight(self):
+        pretrained = self.pretrained
+        if pretrained:
+            if 'http' in pretrained:
+                path = paddle.utils.download.get_weights_path_from_url(
+                    pretrained)
+            else:
+                path = pretrained
+
+            load_state_dict = paddle.load(path)
+            model_state_dict = self.state_dict()
+            pos_embed_name = "pos_embed"
+
+            if pos_embed_name in load_state_dict.keys(
+            ) and self.use_abs_pos_emb:
+                load_pos_embed = paddle.to_tensor(
+                    load_state_dict[pos_embed_name], dtype="float32")
+                if self.pos_embed.shape != load_pos_embed.shape:
+                    pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
+                    model_state_dict[pos_embed_name] = self.resize_pos_embed(
+                        load_pos_embed, (pos_size, pos_size),
+                        (self.pos_h, self.pos_w))
+
+                    # self.set_state_dict(model_state_dict)
+                    load_state_dict[pos_embed_name] = model_state_dict[
+                        pos_embed_name]
+
+                    print("Load pos_embed and resize it from {} to {} .".format(
+                        load_pos_embed.shape, self.pos_embed.shape))
+
+            self.set_state_dict(load_state_dict)
+            print("Load load_state_dict....")
+
+    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
+        if patch_size == 16:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2),
+                nn.BatchNorm2D(embed_dim),
+                nn.GELU(),
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn3 = Identity()
+
+            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
+        elif patch_size == 8:
+            self.fpn1 = nn.Sequential(
+                nn.Conv2DTranspose(
+                    embed_dim, embed_dim, kernel_size=2, stride=2), )
+
+            self.fpn2 = Identity()
+
+            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
+
+            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
+
+        if not out_with_norm:
+            self.norm = Identity()
+        else:
+            self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)
+
+    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
+        """
+        Resize pos_embed weight.
+        Args:
+            pos_embed (Tensor): the pos_embed weight
+            old_hw (list[int]): the height and width of old pos_embed
+            new_hw (list[int]): the height and width of new pos_embed
+        Returns:
+            Tensor: the resized pos_embed weight
+        """
+        cls_pos_embed = pos_embed[:, :1, :]
+        pos_embed = pos_embed[:, 1:, :]
+
+        pos_embed = pos_embed.transpose([0, 2, 1])
+        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
+        pos_embed = F.interpolate(
+            pos_embed, new_hw, mode='bicubic', align_corners=False)
+        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
+        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
+
+        return pos_embed
+
+    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
+        grid_y, grid_x = paddle.meshgrid(
+            paddle.arange(
+                h, dtype=paddle.float32),
+            paddle.arange(
+                w, dtype=paddle.float32))
+        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = self.embed_dim // 4
+        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = (1. / (temperature**omega)).unsqueeze(0)
+
+        out_x = grid_x.reshape([-1, 1]).matmul(omega)
+        out_y = grid_y.reshape([-1, 1]).matmul(omega)
+
+        pos_emb = paddle.concat(
+            [
+                paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
+                paddle.cos(out_x)
+            ],
+            axis=1)
+
+        return pos_emb.reshape([1, h, w, self.embed_dim])
+
+    def forward(self, inputs):
+        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
+        B, Hp, Wp, _ = paddle.shape(x)
+
+        if self.use_abs_pos:
+            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
+
+        if self.use_abs_pos_emb:
+            x = x + self.resize_pos_embed(self.pos_embed,
+                                          (self.pos_h, self.pos_w), (Hp, Wp))
+
+        feats = []
+        for idx, blk in enumerate(self.blocks):
+            if self.use_checkpoint and self.training:
+                x = paddle.distributed.fleet.utils.recompute(
+                    blk, x, **{"preserve_rng_state": True})
+            else:
+                x = blk(x)
+            if idx in self.out_indices:
+                feats.append(self.norm(x.transpose([0, 3, 1, 2])))
+
+        if self.with_fpn:
+            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
+            for i in range(len(feats)):
+                feats[i] = fpns[i](feats[i])
+        return feats
+
+    @property
+    def num_layers(self):
+        return len(self.blocks)
+
+    @property
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=c, stride=s)
+            for c, s in zip(self.out_channels, self.out_strides)
+        ]
+
+
+class LayerNorm(nn.Layer):
+    """
+    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
+    variance normalization over the channel dimension for inputs that have shape
+    (batch_size, channels, height, width).    
+    Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid.
+
+    In ViT, we use the nn.LayerNorm
+    """
+
+    def __init__(self, normalized_shape, eps=1e-6):
+        super().__init__()
+        self.weight = self.create_parameter([normalized_shape])
+        self.bias = self.create_parameter([normalized_shape])
+        self.eps = eps
+        self.normalized_shape = (normalized_shape, )
+
+    def forward(self, x):
+        u = x.mean(1, keepdim=True)
+        s = (x - u).pow(2).mean(1, keepdim=True)
+        x = (x - u) / paddle.sqrt(s + self.eps)
+        x = self.weight[:, None, None] * x + self.bias[:, None, None]
+        return x
+
+
+@register
+@serializable
+class SimpleFeaturePyramid(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 spatial_scales,
+                 num_levels=4,
+                 use_bias=False):
+        """
+        Args:
+            in_channels (list[int]): input channels of each level which can be 
+                derived from the output shape of backbone by from_config
+            out_channel (int): output channel of each level.
+            spatial_scales (list[float]): list of scaling factors to upsample or downsample
+                the input features for creating pyramid features which can be derived from 
+                the output shape of backbone by from_config
+            num_levels (int): number of levels of output features.
+            use_bias (bool): whether use bias or not.
+        """
+        super(SimpleFeaturePyramid, self).__init__()
+
+        self.in_channels = in_channels[0]
+        self.out_channels = out_channels
+        self.num_levels = num_levels
+
+        self.stages = []
+        dim = self.in_channels
+        if num_levels == 4:
+            scale_factors = [2.0, 1.0, 0.5]
+        elif num_levels == 5:
+            scale_factors = [4.0, 2.0, 1.0, 0.5]
+        else:
+            raise NotImplementedError(
+                f"num_levels={num_levels} is not supported yet.")
+
+        dim = in_channels[0]
+        for idx, scale in enumerate(scale_factors):
+            out_dim = dim
+            if scale == 4.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2),
+                    nn.LayerNorm(dim // 2),
+                    nn.GELU(),
+                    nn.Conv2DTranspose(
+                        dim // 2, dim // 4, kernel_size=2, stride=2),
+                ]
+                out_dim = dim // 4
+            elif scale == 2.0:
+                layers = [
+                    nn.Conv2DTranspose(
+                        dim, dim // 2, kernel_size=2, stride=2)
+                ]
+                out_dim = dim // 2
+            elif scale == 1.0:
+                layers = []
+            elif scale == 0.5:
+                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]
+
+            layers.extend([
+                nn.Conv2D(
+                    out_dim,
+                    out_channels,
+                    kernel_size=1,
+                    bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(
+                        out_channels,
+                        out_channels,
+                        kernel_size=3,
+                        padding=1,
+                        bias_attr=use_bias, ), LayerNorm(out_channels)
+            ])
+            layers = nn.Sequential(*layers)
+
+            stage = -int(math.log2(spatial_scales[0] * scale_factors[idx]))
+            self.add_sublayer(f"simfp_{stage}", layers)
+            self.stages.append(layers)
+
+        # top block output feature maps.
+        self.top_block = nn.Sequential(
+            nn.MaxPool2D(
+                kernel_size=1, stride=2, padding=0))
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'spatial_scales': [1.0 / i.stride for i in input_shape],
+        }
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(channels=self.out_channels)
+            for _ in range(self.num_levels)
+        ]
+
+    def forward(self, feats):
+        """
+        Args:
+            x: Tensor of shape (N,C,H,W).
+        """
+        features = feats[0]
+        results = []
+
+        for stage in self.stages:
+            results.append(stage(features))
+
+        top_block_in_feature = results[-1]
+        results.append(self.top_block(top_block_in_feature))
+        assert self.num_levels == len(results)
+
+        return results