first commit
This commit is contained in:
30
rtdetr_paddle/ppdet/modeling/backbones/__init__.py
Normal file
30
rtdetr_paddle/ppdet/modeling/backbones/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .resnet import *
|
||||
from .darknet import *
|
||||
from .mobilenet_v1 import *
|
||||
from .mobilenet_v3 import *
|
||||
from .shufflenet_v2 import *
|
||||
from .swin_transformer import *
|
||||
from .lcnet import *
|
||||
from .cspresnet import *
|
||||
from .csp_darknet import *
|
||||
from .convnext import *
|
||||
from .vision_transformer import *
|
||||
from .mobileone import *
|
||||
from .trans_encoder import *
|
||||
from .focalnet import *
|
||||
from .vit_mae import *
|
||||
from .hgnet_v2 import *
|
||||
245
rtdetr_paddle/ppdet/modeling/backbones/convnext.py
Normal file
245
rtdetr_paddle/ppdet/modeling/backbones/convnext.py
Normal file
@@ -0,0 +1,245 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
'''
|
||||
Modified from https://github.com/facebookresearch/ConvNeXt
|
||||
Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
All rights reserved.
|
||||
This source code is licensed under the license found in the
|
||||
LICENSE file in the root directory of this source tree.
|
||||
'''
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
from .transformer_utils import DropPath, trunc_normal_, zeros_
|
||||
|
||||
__all__ = ['ConvNeXt']
|
||||
|
||||
|
||||
class Block(nn.Layer):
    r"""ConvNeXt residual block.

    Data flow: 7x7 depthwise conv -> permute to (N, H, W, C) -> LayerNorm
    (channels_last) -> Linear -> GELU -> Linear -> optional layer scale ->
    permute back to (N, C, H, W) -> drop path -> residual add.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale; no scale
            parameter is created unless the value is > 0. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        expanded_dim = 4 * dim

        # Depthwise conv: groups == channels.
        self.dwconv = nn.Conv2D(
            dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise / 1x1 convs, implemented with linear layers.
        self.pwconv1 = nn.Linear(dim, expanded_dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(expanded_dim, dim)

        if layer_scale_init_value > 0:
            scale_attr = ParamAttr(
                initializer=Constant(layer_scale_init_value))
            self.gamma = self.create_parameter(
                shape=(dim, ), attr=scale_attr)
        else:
            self.gamma = None

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return ``x`` plus the block output."""
        shortcut = x
        # (N, C, H, W) -> (N, H, W, C) so norm/linear act on the last axis.
        feat = self.dwconv(x).transpose([0, 2, 3, 1])
        feat = self.norm(feat)
        feat = self.pwconv2(self.act(self.pwconv1(feat)))
        if self.gamma is not None:
            feat = self.gamma * feat
        # Back to (N, C, H, W) for the residual connection.
        feat = feat.transpose([0, 3, 1, 2])
        return shortcut + self.drop_path(feat)
|
||||
|
||||
|
||||
class LayerNorm(nn.Layer):
    r"""LayerNorm over the channel axis for two data layouts.

    ``channels_last`` (default) expects inputs shaped
    (batch_size, height, width, channels) and delegates to ``F.layer_norm``.
    ``channels_first`` expects (batch_size, channels, height, width) and
    normalizes over axis 1 by hand.

    Args:
        normalized_shape (int): Size of the channel axis.
        eps (float): Value added to the variance before the square root.
        data_format (str): "channels_last" or "channels_first".

    Raises:
        NotImplementedError: If ``data_format`` is neither supported value.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()

        param_shape = (normalized_shape, )
        # Learnable per-channel scale (starts at 1) and shift (starts at 0).
        self.weight = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
        self.bias = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))

        self.eps = eps
        self.data_format = data_format
        supported_formats = ["channels_last", "channels_first"]
        if self.data_format not in supported_formats:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalize ``x`` along its channel axis and apply scale and shift."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.eps)
        if self.data_format == "channels_first":
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / paddle.sqrt(var + self.eps)
            # Broadcast the (C,) parameters over the two trailing axes.
            return self.weight[:, None, None] * normed + self.bias[:, None,
                                                                   None]
|
||||
|
||||
|
||||
@register
@serializable
class ConvNeXt(nn.Layer):
    r"""ConvNeXt backbone.

    Paddle implementation of `A ConvNet for the 2020s` -
    https://arxiv.org/pdf/2201.03545.pdf

    Args:
        arch (str): Key into ``arch_settings`` selecting the per-stage block
            counts (``depths``) and channel widths (``dims``). Default: 'tiny'
        in_chans (int): Number of input image channels. Default: 3
        drop_path_rate (float): Largest stochastic depth rate; per-block rates
            are spaced linearly from 0 up to this value. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        return_idx (list[int]): Indices of the stages whose feature maps are
            returned. Default: [1, 2, 3]
        norm_output (bool): Whether to apply a channels_first LayerNorm to
            every returned feature map. Default: True
        pretrained (str|None): ``http://`` / ``https://`` URL or local path of
            weights loaded after initialization. Default: None

    Raises:
        KeyError: If ``arch`` is not a key of ``arch_settings``.
    """

    arch_settings = {
        'tiny': {
            'depths': [3, 3, 9, 3],
            'dims': [96, 192, 384, 768]
        },
        'small': {
            'depths': [3, 3, 27, 3],
            'dims': [96, 192, 384, 768]
        },
        'base': {
            'depths': [3, 3, 27, 3],
            'dims': [128, 256, 512, 1024]
        },
        'large': {
            'depths': [3, 3, 27, 3],
            'dims': [192, 384, 768, 1536]
        },
        'xlarge': {
            'depths': [3, 3, 27, 3],
            'dims': [256, 512, 1024, 2048]
        },
    }

    def __init__(
            self,
            arch='tiny',
            in_chans=3,
            drop_path_rate=0.,
            layer_scale_init_value=1e-6,
            return_idx=[1, 2, 3],
            norm_output=True,
            pretrained=None, ):
        super().__init__()
        depths = self.arch_settings[arch]['depths']
        dims = self.arch_settings[arch]['dims']

        # Stem and 3 intermediate downsampling conv layers.
        self.downsample_layers = nn.LayerList()
        stem = nn.Sequential(
            nn.Conv2D(
                in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(
                dims[0], eps=1e-6, data_format="channels_first"))
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                LayerNorm(
                    dims[i], eps=1e-6, data_format="channels_first"),
                nn.Conv2D(
                    dims[i], dims[i + 1], kernel_size=2, stride=2), )
            self.downsample_layers.append(downsample_layer)

        # 4 feature resolution stages, each made of several residual blocks.
        self.stages = nn.LayerList()
        # One drop-path rate per block, increasing linearly across the net.
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        cur = 0
        for i in range(4):
            stage = nn.Sequential(* [
                Block(
                    dim=dims[i],
                    drop_path=dp_rates[cur + j],
                    layer_scale_init_value=layer_scale_init_value)
                for j in range(depths[i])
            ])
            self.stages.append(stage)
            cur += depths[i]

        self.return_idx = return_idx
        self.dims = [dims[i] for i in return_idx]

        self.norm_output = norm_output
        if norm_output:
            self.norms = nn.LayerList([
                LayerNorm(
                    c, eps=1e-6, data_format="channels_first")
                for c in self.dims
            ])

        self.apply(self._init_weights)

        if pretrained is not None:
            # Only an explicit http(s) scheme marks a URL. A substring test
            # would wrongly send local paths such as "/data/http_ckpt/x" to
            # the downloader.
            if str(pretrained).startswith(('http://', 'https://')):
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _init_weights(self, m):
        """Initialize conv/linear weights with trunc_normal_ and zero biases."""
        if isinstance(m, (nn.Conv2D, nn.Linear)):
            trunc_normal_(m.weight)
            zeros_(m.bias)

    def forward_features(self, x):
        """Run all four stages and return the maps selected by ``return_idx``."""
        output = []
        for i in range(4):
            x = self.downsample_layers[i](x)
            x = self.stages[i](x)
            output.append(x)

        outputs = [output[i] for i in self.return_idx]
        if self.norm_output:
            # self.norms is ordered like return_idx, so pair them positionally.
            outputs = [norm(out) for norm, out in zip(self.norms, outputs)]

        return outputs

    def forward(self, x):
        """Extract features from ``x['image']``."""
        x = self.forward_features(x['image'])
        return x

    @property
    def out_shape(self):
        """ShapeSpec (channels only) for each returned feature map."""
        return [ShapeSpec(channels=c) for c in self.dims]
|
||||
404
rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
Normal file
404
rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
Normal file
@@ -0,0 +1,404 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.initializer import conv_init_
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = [
|
||||
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
|
||||
]
|
||||
|
||||
|
||||
class BaseConv(nn.Layer):
    """Conv2D followed by BatchNorm2D and an ``x * sigmoid(x)`` activation.

    Args:
        in_channels (int): Input channels of the conv.
        out_channels (int): Output channels of the conv and the batch norm.
        ksize (int): Kernel size; padding is ``(ksize - 1) // 2``.
        stride (int): Conv stride.
        groups (int): Conv groups. Default: 1
        bias (bool): Passed to the conv as ``bias_attr``. Default: False
        act (str): Accepted for interface compatibility; this class does not
            read it and always applies ``x * sigmoid(x)``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride,
                 groups=1,
                 bias=False,
                 act="silu"):
        super(BaseConv, self).__init__()
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias_attr=bias)
        # Batch-norm scale and shift get an L2Decay(0.0) regularizer.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        self._init_weights()

    def _init_weights(self):
        """Initialize the conv through ``conv_init_``."""
        conv_init_(self.conv)

    def forward(self, x):
        """Return ``bn(conv(x))`` gated by its own sigmoid."""
        # 'x * F.sigmoid(x)' is used in place of a 'silu' op.
        feat = self.bn(self.conv(x))
        return feat * F.sigmoid(feat)
|
||||
|
||||
|
||||
class DWConv(nn.Layer):
    """Depthwise conv followed by a 1x1 pointwise conv, both ``BaseConv``.

    Args:
        in_channels (int): Input channels; also the group count and output
            channels of the depthwise conv.
        out_channels (int): Output channels of the pointwise conv.
        ksize (int): Kernel size of the depthwise conv.
        stride (int): Stride of the depthwise conv. Default: 1
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(DWConv, self).__init__()
        shared_kwargs = dict(bias=bias, act=act)
        # groups == in_channels makes this conv depthwise.
        self.dw_conv = BaseConv(
            in_channels,
            in_channels,
            ksize=ksize,
            stride=stride,
            groups=in_channels,
            **shared_kwargs)
        self.pw_conv = BaseConv(
            in_channels,
            out_channels,
            ksize=1,
            stride=1,
            groups=1,
            **shared_kwargs)

    def forward(self, x):
        """Apply the depthwise conv, then the pointwise conv."""
        depthwise_out = self.dw_conv(x)
        return self.pw_conv(depthwise_out)
|
||||
|
||||
|
||||
class Focus(nn.Layer):
    """Focus width and height information into channel space, used in YOLOX.

    The input is sampled at every second position along axes 2 and 3 with the
    four possible offsets; the four slices are concatenated on the channel
    axis and passed through a ``BaseConv``.

    Args:
        in_channels (int): Channels of the input; the conv sees 4x as many.
        out_channels (int): Output channels of the conv.
        ksize (int): Conv kernel size. Default: 3
        stride (int): Conv stride. Default: 1
        bias (bool): Forwarded to the conv. Default: False
        act (str): Forwarded to the conv. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=3,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(Focus, self).__init__()
        stacked_channels = in_channels * 4
        self.conv = BaseConv(
            stacked_channels,
            out_channels,
            ksize=ksize,
            stride=stride,
            bias=bias,
            act=act)

    def forward(self, inputs):
        """Stack the four strided slices on axis 1 and convolve them."""
        # inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2]
        # The concat order is (row offset, col offset) =
        # (0, 0), (1, 0), (0, 1), (1, 1).
        offsets = [(0, 0), (1, 0), (0, 1), (1, 1)]
        patches = [inputs[:, :, r::2, c::2] for r, c in offsets]
        stacked = paddle.concat(patches, 1)
        return self.conv(stacked)
|
||||
|
||||
|
||||
class BottleNeck(nn.Layer):
    """1x1 conv followed by a 3x3 conv, with an optional identity shortcut.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        shortcut (bool): Request a residual add; it is only applied when
            ``in_channels == out_channels``. Default: True
        expansion (float): Hidden channels are
            ``int(out_channels * expansion)``. Default: 0.5
        depthwise (bool): Use ``DWConv`` instead of ``BaseConv`` for the 3x3
            conv. Default: False
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(BottleNeck, self).__init__()
        hidden_channels = int(out_channels * expansion)
        if depthwise:
            conv3x3_cls = DWConv
        else:
            conv3x3_cls = BaseConv
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = conv3x3_cls(
            hidden_channels,
            out_channels,
            ksize=3,
            stride=1,
            bias=bias,
            act=act)
        # The residual add needs matching channel counts.
        self.add_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        """Return ``conv2(conv1(x))``, plus ``x`` when the shortcut is on."""
        out = self.conv2(self.conv1(x))
        if not self.add_shortcut:
            return out
        return out + x
|
||||
|
||||
|
||||
class SPPLayer(nn.Layer):
    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.

    A 1x1 conv halves the channels, stride-1 max pools of each kernel size
    run in parallel on that result, and the un-pooled map plus all pooled
    maps are concatenated on the channel axis and fused by another 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        kernel_sizes (tuple[int]): Max-pool kernel sizes. Default: (5, 9, 13)
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 bias=False,
                 act="silu"):
        super(SPPLayer, self).__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        pools = []
        for ks in kernel_sizes:
            pools.append(
                nn.MaxPool2D(
                    kernel_size=ks, stride=1, padding=ks // 2))
        self.maxpoolings = nn.LayerList(pools)
        # One branch per pool plus the un-pooled branch.
        num_branches = len(kernel_sizes) + 1
        conv2_channels = hidden_channels * num_branches
        self.conv2 = BaseConv(
            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)

    def forward(self, x):
        """Pool the reduced map at every kernel size and fuse the branches."""
        reduced = self.conv1(x)
        branches = [reduced]
        for pool in self.maxpoolings:
            branches.append(pool(reduced))
        return self.conv2(paddle.concat(branches, axis=1))
|
||||
|
||||
|
||||
class SPPFLayer(nn.Layer):
    """Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn
    Jocher, equivalent to SPP(k=(5, 9, 13)).

    A single stride-1 max pool is applied three times in sequence; the
    reduced input and the three successive pool outputs are concatenated on
    the channel axis and fused by a 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        ksize (int): Max-pool kernel size. Default: 5
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: 'silu'
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=5,
                 bias=False,
                 act='silu'):
        super(SPPFLayer, self).__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.maxpooling = nn.MaxPool2D(
            kernel_size=ksize, stride=1, padding=ksize // 2)
        # The reduced input plus three chained pool outputs.
        conv2_channels = hidden_channels * 4
        self.conv2 = BaseConv(
            conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)

    def forward(self, x):
        """Chain the max pool three times and fuse all four maps."""
        feats = [self.conv1(x)]
        for _ in range(3):
            # Each pool consumes the previous pool's output.
            feats.append(self.maxpooling(feats[-1]))
        return self.conv2(paddle.concat(feats, axis=1))
|
||||
|
||||
|
||||
class CSPLayer(nn.Layer):
    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.

    Two 1x1 convs project the input into two branches; the first branch
    goes through ``num_blocks`` bottlenecks, then both branches are
    concatenated on the channel axis and fused by a third 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        num_blocks (int): Number of bottlenecks in the first branch. Default: 1
        shortcut (bool): Forwarded to every bottleneck. Default: True
        expansion (float): Branch width is
            ``int(out_channels * expansion)``. Default: 0.5
        depthwise (bool): Forwarded to every bottleneck. Default: False
        bias (bool): Forwarded to all convs. Default: False
        act (str): Forwarded to all convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=1,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(CSPLayer, self).__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)

        blocks = []
        for _ in range(num_blocks):
            # expansion=1.0 keeps the bottleneck's hidden width at
            # hidden_channels.
            blocks.append(
                BottleNeck(
                    hidden_channels,
                    hidden_channels,
                    shortcut=shortcut,
                    expansion=1.0,
                    depthwise=depthwise,
                    bias=bias,
                    act=act))
        self.bottlenecks = nn.Sequential(*blocks)

        self.conv3 = BaseConv(
            hidden_channels * 2,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Run both branches on ``x`` and fuse their concatenation."""
        main_branch = self.bottlenecks(self.conv1(x))
        side_branch = self.conv2(x)
        merged = paddle.concat([main_branch, side_branch], axis=1)
        return self.conv3(merged)
|
||||
|
||||
|
||||
@register
@serializable
class CSPDarkNet(nn.Layer):
    """
    CSPDarkNet backbone.

    Args:
        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
        depth_mult (float): Depth multiplier, multiply number of blocks in
            CSPLayer, default as 1.0.
        width_mult (float): Width multiplier, multiply number of channels in
            each layer, default as 1.0.
        depthwise (bool): Whether to use depth-wise conv layer.
        act (str): Activation function type, default as 'silu'.
        trt (bool): Accepted (and listed in ``__shared__``) but not read by
            this class.
        return_idx (list): Index of stages whose feature maps are returned.
            Index 0 is the stem; stage ``k`` of ``arch_settings`` is ``k + 1``.

    Raises:
        KeyError: If ``arch`` is not a key of ``arch_settings``.
    """

    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']

    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
    arch_settings = {
        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, True, True]],
    }

    def __init__(self,
                 arch='X',
                 depth_mult=1.0,
                 width_mult=1.0,
                 depthwise=False,
                 act='silu',
                 trt=False,
                 return_idx=[2, 3, 4]):
        super(CSPDarkNet, self).__init__()
        self.arch = arch
        self.return_idx = return_idx
        Conv = DWConv if depthwise else BaseConv
        arch_setting = self.arch_settings[arch]
        base_channels = int(arch_setting[0][0] * width_mult)

        # Note: differences between the latest YOLOv5 and the original YOLOX
        # 1. self.stem: Conv stem (YOLOv5) or Focus stem (YOLOX)
        # 2. use SPPF (in YOLOv5) or SPP (in YOLOX)
        # 3. put SPPF after (YOLOv5) or SPP before (YOLOX) the CSPLayer of the
        #    last cspdark block
        # 4. whether that block's CSPLayer adds a shortcut: True in YOLOv5,
        #    False in YOLOX
        if arch in ['P5', 'P6']:
            # in the latest YOLOv5, use Conv stem, and SPPF (fast, only a
            # single spp kernel size, fixed to 5 where SPPFLayer is built)
            self.stem = Conv(
                3, base_channels, ksize=6, stride=2, bias=False, act=act)
        elif arch in ['X']:
            # in the original YOLOX, use Focus stem, and SPP (three spp
            # kernel sizes)
            self.stem = Focus(
                3, base_channels, ksize=3, stride=1, bias=False, act=act)
        else:
            # NOTE: not reachable as written -- the arch_settings lookup above
            # already raises KeyError for an unknown arch.
            raise AttributeError("Unsupported arch type: {}".format(arch))

        # Kernel sizes of the SPPLayer, which is only built for arch 'X'.
        spp_kernel_sizes = (5, 9, 13)

        _out_channels = [base_channels]
        # Running counter embedded in sublayer names; it fixes the parameter
        # names of the state dict, so the increment order must not change.
        layers_num = 1
        self.csp_dark_blocks = []

        for i, (in_channels, out_channels, num_blocks, shortcut,
                use_spp) in enumerate(arch_setting):
            in_channels = int(in_channels * width_mult)
            out_channels = int(out_channels * width_mult)
            _out_channels.append(out_channels)
            num_blocks = max(round(num_blocks * depth_mult), 1)
            stage = []

            # Stride-2 conv that opens every stage.
            conv_layer = self.add_sublayer(
                'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
                Conv(
                    in_channels, out_channels, 3, 2, bias=False, act=act))
            stage.append(conv_layer)
            layers_num += 1

            if use_spp and arch in ['X']:
                # in YOLOX use SPPLayer
                spp_layer = self.add_sublayer(
                    'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
                    SPPLayer(
                        out_channels,
                        out_channels,
                        kernel_sizes=spp_kernel_sizes,
                        bias=False,
                        act=act))
                stage.append(spp_layer)
                layers_num += 1

            csp_layer = self.add_sublayer(
                'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
                CSPLayer(
                    out_channels,
                    out_channels,
                    num_blocks=num_blocks,
                    shortcut=shortcut,
                    depthwise=depthwise,
                    bias=False,
                    act=act))
            stage.append(csp_layer)
            layers_num += 1

            if use_spp and arch in ['P5', 'P6']:
                # in latest YOLOv5 use SPPFLayer instead of SPPLayer
                sppf_layer = self.add_sublayer(
                    'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
                    SPPFLayer(
                        out_channels,
                        out_channels,
                        ksize=5,
                        bias=False,
                        act=act))
                stage.append(sppf_layer)
                layers_num += 1

            self.csp_dark_blocks.append(nn.Sequential(*stage))

        self._out_channels = [_out_channels[i] for i in self.return_idx]
        self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]

    def forward(self, inputs):
        """Run the stem and every stage on ``inputs['image']``.

        Returns:
            list: Output of each stage whose 1-based position is in
            ``return_idx``, in stage order.
        """
        x = inputs['image']
        outputs = []
        x = self.stem(x)
        for i, layer in enumerate(self.csp_dark_blocks):
            x = layer(x)
            # Stage positions are offset by one because index 0 is the stem.
            if i + 1 in self.return_idx:
                outputs.append(x)
        return outputs

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) for each entry of ``return_idx``."""
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self._out_channels, self.strides)
        ]
|
||||
321
rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
Normal file
321
rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
Normal file
@@ -0,0 +1,321 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D followed by BatchNorm2D and an activation.

    Args:
        ch_in (int): Input channels.
        ch_out (int): Output channels.
        filter_size (int): Conv kernel size. Default: 3
        stride (int): Conv stride. Default: 1
        groups (int): Conv groups. Default: 1
        padding (int): Conv padding. Default: 0
        act (None|str|dict|callable): ``None``, a string or a dict is
            resolved through ``get_act_fn``; anything else is used as the
            activation directly. Default: None
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act=None):
        super(ConvBNLayer, self).__init__()

        self.conv = nn.Conv2D(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)

        # Batch-norm scale and shift get an L2Decay(0.0) regularizer.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            ch_out, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Apply conv, batch norm and the activation in that order."""
        normed = self.bn(self.conv(x))
        return self.act(normed)
|
||||
|
||||
|
||||
class RepVggBlock(nn.Layer):
    """Re-parameterizable block: parallel 3x3 and 1x1 ConvBN branches.

    While ``conv1``/``conv2`` exist the output is
    ``act(conv1(x) + alpha * conv2(x))`` (``alpha`` omitted when disabled).
    ``convert_to_deploy`` folds both branches and their batch norms into a
    single 3x3 conv stored as ``self.conv``.

    Args:
        ch_in (int): Input channels.
        ch_out (int): Output channels.
        act (None|str|dict|callable): ``None``, a string or a dict is
            resolved through ``get_act_fn``; anything else is used directly.
            Default: 'relu'
        alpha (bool): Create a learnable scalar (initialized to 1) that
            scales the 1x1 branch. Default: False
    """

    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
        super(RepVggBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=None)
        self.conv2 = ConvBNLayer(
            ch_in, ch_out, 1, stride=1, padding=0, act=None)
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act
        if alpha:
            self.alpha = self.create_parameter(
                shape=[1],
                attr=ParamAttr(initializer=Constant(value=1.)),
                dtype="float32")
        else:
            self.alpha = None

    def forward(self, x):
        """Use the fused conv if present, else sum the two branches."""
        if hasattr(self, 'conv'):
            y = self.conv(x)
        else:
            # Test for the parameter's presence, not its value: a learned
            # alpha of 0 must still scale the 1x1 branch (to zero) rather
            # than fall through to the unscaled sum.
            if self.alpha is not None:
                y = self.conv1(x) + self.alpha * self.conv2(x)
            else:
                y = self.conv1(x) + self.conv2(x)
        y = self.act(y)
        return y

    def convert_to_deploy(self):
        """Fuse both branches into ``self.conv`` and drop the branches.

        Calling it again after the branches are gone is a no-op.
        """
        if not hasattr(self, 'conv1') or not hasattr(self, 'conv2'):
            # Already converted: there is nothing left to fuse.
            return
        if not hasattr(self, 'conv'):
            self.conv = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1)
        kernel, bias = self.get_equivalent_kernel_bias()
        self.conv.weight.set_value(kernel)
        self.conv.bias.set_value(bias)
        self.__delattr__('conv1')
        self.__delattr__('conv2')

    def get_equivalent_kernel_bias(self):
        """Return the (kernel, bias) of a 3x3 conv equal to both branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        padded1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
        # Same presence test as in forward, so the fused conv matches it.
        if self.alpha is not None:
            return (kernel3x3 + self.alpha * padded1x1,
                    bias3x3 + self.alpha * bias1x1)
        return kernel3x3 + padded1x1, bias3x3 + bias1x1

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a kernel by one element on each side of its last two axes."""
        if kernel1x1 is None:
            return 0
        else:
            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a ConvBNLayer's batch-norm statistics into (kernel, bias)."""
        if branch is None:
            return 0, 0
        kernel = branch.conv.weight
        running_mean = branch.bn._mean
        running_var = branch.bn._variance
        gamma = branch.bn.weight
        beta = branch.bn.bias
        eps = branch.bn._epsilon
        std = (running_var + eps).sqrt()
        # One scale factor per output channel, broadcast over the kernel.
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    """3x3 ConvBNLayer followed by a RepVggBlock, with optional residual add.

    Args:
        ch_in (int): Input channels; must equal ``ch_out``.
        ch_out (int): Output channels.
        act (None|str|dict|callable): Activation forwarded to both sublayers.
            Default: 'relu'
        shortcut (bool): Add the input to the output. Default: True
        use_alpha (bool): Forwarded to ``RepVggBlock`` as ``alpha``.
            Default: False
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 act='relu',
                 shortcut=True,
                 use_alpha=False):
        super(BasicBlock, self).__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=act)
        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
        self.shortcut = shortcut

    def forward(self, x):
        """Return ``conv2(conv1(x))``, plus ``x`` when ``shortcut`` is set."""
        out = self.conv2(self.conv1(x))
        if not self.shortcut:
            return out
        return paddle.add(x, out)
|
||||
|
||||
|
||||
class EffectiveSELayer(nn.Layer):
    """ Effective Squeeze-Excitation
    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667

    Args:
        channels (int): Input and output channels of the 1x1 conv.
        act (None|str|dict|callable): ``None``, a string or a dict is
            resolved through ``get_act_fn``; anything else is used directly.
            Default: 'hardsigmoid'
    """

    def __init__(self, channels, act='hardsigmoid'):
        super(EffectiveSELayer, self).__init__()
        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Scale ``x`` by a gate computed from its mean over axes 2 and 3."""
        pooled = x.mean((2, 3), keepdim=True)
        gate = self.act(self.fc(pooled))
        return x * gate
|
||||
|
||||
|
||||
class CSPResStage(nn.Layer):
    """One CSP stage: optional stride-2 conv, two branches, attention, fuse.

    Args:
        block_fn (callable): Block constructor, called as
            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)``.
        ch_in (int): Input channels.
        ch_out (int): Output channels.
        n (int): Number of blocks in the second branch.
        stride (int): A value of 2 adds a stride-2 3x3 conv mapping ``ch_in``
            to ``(ch_in + ch_out) // 2``; any other value adds none.
        act (None|str|dict|callable): Activation forwarded to sublayers.
            Default: 'relu'
        attn: Any truthy value enables an ``EffectiveSELayer``; only its
            truthiness is read. Default: 'eca'
        use_alpha (bool): Forwarded to every block. Default: False
    """

    def __init__(self,
                 block_fn,
                 ch_in,
                 ch_out,
                 n,
                 stride,
                 act='relu',
                 attn='eca',
                 use_alpha=False):
        super(CSPResStage, self).__init__()

        ch_mid = (ch_in + ch_out) // 2
        ch_branch = ch_mid // 2

        if stride == 2:
            self.conv_down = ConvBNLayer(
                ch_in, ch_mid, 3, stride=2, padding=1, act=act)
        else:
            self.conv_down = None

        self.conv1 = ConvBNLayer(ch_mid, ch_branch, 1, act=act)
        self.conv2 = ConvBNLayer(ch_mid, ch_branch, 1, act=act)

        blocks = []
        for _ in range(n):
            blocks.append(
                block_fn(
                    ch_branch,
                    ch_branch,
                    act=act,
                    shortcut=True,
                    use_alpha=use_alpha))
        self.blocks = nn.Sequential(*blocks)

        if attn:
            self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
        else:
            self.attn = None

        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)

    def forward(self, x):
        """Downsample if configured, run both branches, then fuse them."""
        if self.conv_down is not None:
            x = self.conv_down(x)
        skip_branch = self.conv1(x)
        block_branch = self.blocks(self.conv2(x))
        merged = paddle.concat([skip_branch, block_branch], axis=1)
        if self.attn is not None:
            merged = self.attn(merged)
        return self.conv3(merged)
|
||||
|
||||
|
||||
@register
@serializable
class CSPResNet(nn.Layer):
    """CSPResNet backbone: a conv stem followed by stride-2 CSPResStages.

    Args:
        layers (list[int]): Blocks per stage, scaled by ``depth_mult``.
        channels (list[int]): Stem output channels followed by each stage's
            output channels, scaled by ``width_mult``.
        act (None|str|dict|callable): ``None``, a string or a dict is
            resolved through ``get_act_fn`` (with ``trt``); anything else is
            used directly. Default: 'swish'
        return_idx (list[int]): 0-based stage indices whose outputs are
            returned. Default: [1, 2, 3]
        depth_wise (bool): Accepted but not read by this class.
        use_large_stem (bool): Use a three-conv stem instead of two convs.
        width_mult (float): Channel multiplier. Default: 1.0
        depth_mult (float): Block-count multiplier. Default: 1.0
        trt (bool): Forwarded to ``get_act_fn``. Default: False
        use_checkpoint (bool): Recompute stages during training; also calls
            ``paddle.seed(0)`` at construction. Default: False
        use_alpha (bool): Forwarded to every stage. Default: False
        **args: Extra keyword arguments are accepted and ignored.
    """

    __shared__ = ['width_mult', 'depth_mult', 'trt']

    def __init__(self,
                 layers=[3, 6, 6, 3],
                 channels=[64, 128, 256, 512, 1024],
                 act='swish',
                 return_idx=[1, 2, 3],
                 depth_wise=False,
                 use_large_stem=False,
                 width_mult=1.0,
                 depth_mult=1.0,
                 trt=False,
                 use_checkpoint=False,
                 use_alpha=False,
                 **args):
        super(CSPResNet, self).__init__()
        self.use_checkpoint = use_checkpoint
        # Scale widths and depths, never dropping below 1.
        channels = [max(round(width * width_mult), 1) for width in channels]
        layers = [max(round(depth * depth_mult), 1) for depth in layers]
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        stem_mid = channels[0] // 2
        # The first stem conv is shared by both stem variants.
        stem_layers = [('conv1', ConvBNLayer(
            3, stem_mid, 3, stride=2, padding=1, act=act))]
        if use_large_stem:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_mid, stem_mid, 3, stride=1, padding=1, act=act)))
            stem_layers.append(('conv3', ConvBNLayer(
                stem_mid, channels[0], 3, stride=1, padding=1, act=act)))
        else:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_mid, channels[0], 3, stride=1, padding=1, act=act)))
        self.stem = nn.Sequential(*stem_layers)

        n = len(channels) - 1
        named_stages = []
        for i in range(n):
            stage = CSPResStage(
                BasicBlock,
                channels[i],
                channels[i + 1],
                layers[i],
                2,
                act=act,
                use_alpha=use_alpha)
            named_stages.append((str(i), stage))
        self.stages = nn.Sequential(*named_stages)

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = return_idx
        if use_checkpoint:
            paddle.seed(0)

    def forward(self, inputs):
        """Run the stem and all stages on ``inputs['image']``.

        Returns:
            list: Outputs of the stages whose index is in ``return_idx``,
            in stage order.
        """
        feat = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            recompute_stage = self.use_checkpoint and self.training
            if recompute_stage:
                feat = paddle.distributed.fleet.utils.recompute(
                    stage, feat, preserve_rng_state=True)
            else:
                feat = stage(feat)
            if idx in self.return_idx:
                outs.append(feat)

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) for each entry of ``return_idx``."""
        specs = []
        for i in self.return_idx:
            specs.append(
                ShapeSpec(
                    channels=self._out_channels[i],
                    stride=self._out_strides[i]))
        return specs
|
||||
345
rtdetr_paddle/ppdet/modeling/backbones/darknet.py
Executable file
345
rtdetr_paddle/ppdet/modeling/backbones/darknet.py
Executable file
@@ -0,0 +1,345 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.ops import batch_norm, mish
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['DarkNet', 'ConvBNLayer']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 norm_type='bn',
                 norm_decay=0.,
                 act="leaky",
                 freeze_norm=False,
                 data_format='NCHW',
                 name=''):
        """
        conv + norm + activation layer

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            filter_size (int): filter size, default 3
            stride (int): stride, default 1
            groups (int): number of groups of conv layer, default 1
            padding (int): padding size, default 0
            norm_type (str): norm type passed to ``batch_norm``, default bn
            norm_decay (float): decay for weight and bias of the norm layer,
                default 0.
            act (str | None): activation function type. 'leaky' means
                leaky_relu with negative slope 0.1; any other string is looked
                up by name in ``paddle.nn.functional``; ``None`` applies no
                activation. Default 'leaky'.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
            name (str): accepted for compatibility; not used by this layer
        """
        super(ConvBNLayer, self).__init__()

        # The conv carries no bias; the norm layer follows it directly.
        self.conv = nn.Conv2D(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            data_format=data_format,
            bias_attr=False)
        self.batch_norm = batch_norm(
            ch_out,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.act = act

    def forward(self, inputs):
        """Apply conv, norm and then the configured activation (if any)."""
        out = self.conv(inputs)
        out = self.batch_norm(out)
        if self.act is None:
            # Previously getattr(F, None) raised TypeError here; treat a
            # missing activation as "return the normalized output as is".
            return out
        if self.act == 'leaky':
            return F.leaky_relu(out, 0.1)
        return getattr(F, self.act)(out)
|
||||
|
||||
|
||||
class DownSample(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=2,
                 padding=1,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Downsample layer: a single ConvBNLayer, strided by default.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            filter_size (int): filter size, default 3
            stride (int): stride, default 2
            padding (int): padding size, default 1
            norm_type (str): norm type, default bn
            norm_decay (float): decay for weight and bias of the norm layer,
                default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super(DownSample, self).__init__()

        conv_kwargs = dict(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding)
        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv_bn_layer = ConvBNLayer(**conv_kwargs, **norm_kwargs)
        self.ch_out = ch_out

    def forward(self, inputs):
        """Return the output of the wrapped ConvBNLayer."""
        return self.conv_bn_layer(inputs)
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Residual block of DarkNet: 1x1 conv halving the channels, 3x3 conv
        restoring them, then an element-wise add with the block input.

        Args:
            ch_in (int): input channel; must equal ch_out and be even
            ch_out (int): output channel
            norm_type (str): norm type, default bn
            norm_decay (float): decay for weight and bias of the norm layer,
                default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super(BasicBlock, self).__init__()

        assert ch_in == ch_out and (ch_in % 2) == 0, \
            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"

        # Channel route, e.g. 10 --{conv1}--> 5 --{conv2}--> 10.
        mid_ch = int(ch_out / 2)
        shared_kwargs = dict(
            stride=1,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv1 = ConvBNLayer(
            ch_in=ch_in,
            ch_out=mid_ch,
            filter_size=1,
            padding=0,
            **shared_kwargs)
        self.conv2 = ConvBNLayer(
            ch_in=mid_ch,
            ch_out=ch_out,
            filter_size=3,
            padding=1,
            **shared_kwargs)

    def forward(self, inputs):
        """Return ``inputs + conv2(conv1(inputs))``."""
        residual = self.conv2(self.conv1(inputs))
        return paddle.add(x=inputs, y=residual)
|
||||
|
||||
|
||||
class Blocks(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 count,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None,
                 data_format='NCHW'):
        """
        A chain of ``count`` BasicBlock layers applied one after another.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            count (int): number of BasicBlock layers
            norm_type (str): norm type, default bn
            norm_decay (float): decay for weight and bias of the norm layer,
                default 0.
            freeze_norm (bool): whether to freeze norm, default False
            name (str): prefix used to name the 2nd and later sublayers
                ("<name>.<index>")
            data_format (str): data format, NCHW or NHWC
        """
        super(Blocks, self).__init__()

        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        # The first block is a regular attribute; the rest are registered by
        # name and also kept in a plain list for iteration in forward().
        self.basicblock0 = BasicBlock(ch_in, ch_out, **norm_kwargs)
        self.res_out_list = []
        for idx in range(1, count):
            registered = self.add_sublayer(
                '{}.{}'.format(name, idx),
                BasicBlock(ch_out, ch_out, **norm_kwargs))
            self.res_out_list.append(registered)
        self.ch_out = ch_out

    def forward(self, inputs):
        """Feed ``inputs`` through every BasicBlock in order."""
        out = self.basicblock0(inputs)
        for block in self.res_out_list:
            out = block(out)
        return out
|
||||
|
||||
|
||||
# Network depth -> number of BasicBlock layers in each stage.
DarkNet_cfg = {53: [1, 2, 8, 8, 4]}
|
||||
|
||||
|
||||
@register
@serializable
class DarkNet(nn.Layer):
    __shared__ = ['norm_type', 'data_format']

    def __init__(self,
                 depth=53,
                 freeze_at=-1,
                 return_idx=[2, 3, 4],
                 num_stages=5,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Darknet, see https://pjreddie.com/darknet/yolo/

        Args:
            depth (int): depth of network; must be a key of DarkNet_cfg
            freeze_at (int): index of the stage whose output gets
                stop_gradient set
            return_idx (list): index of stages whose feature maps are returned
            num_stages (int): number of stages to build
            norm_type (str): norm type, default bn
            norm_decay (float): decay for weight and bias of the norm layer,
                default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super(DarkNet, self).__init__()
        self.depth = depth
        self.freeze_at = freeze_at
        self.return_idx = return_idx
        self.num_stages = num_stages
        # Block count of each stage that will be built.
        self.stages = DarkNet_cfg[self.depth][0:num_stages]

        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        self.conv0 = ConvBNLayer(
            ch_in=3,
            ch_out=32,
            filter_size=3,
            stride=1,
            padding=1,
            **norm_kwargs)

        self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, **norm_kwargs)

        self._out_channels = []
        self.darknet_conv_block_list = []
        self.downsample_list = []
        ch_in = [64, 128, 256, 512, 1024]

        # All stage blocks are created first, then the downsample layers
        # that sit between consecutive stages.
        for i, block_count in enumerate(self.stages):
            stage_name = 'stage.{}'.format(i)
            width = int(ch_in[i])
            stage_block = self.add_sublayer(
                stage_name,
                Blocks(
                    width,
                    int(ch_in[i]),
                    block_count,
                    name=stage_name,
                    **norm_kwargs))
            self.darknet_conv_block_list.append(stage_block)
            if i in return_idx:
                self._out_channels.append(int(ch_in[i]))

        for i in range(num_stages - 1):
            transition = self.add_sublayer(
                'stage.{}.downsample'.format(i),
                DownSample(
                    ch_in=int(ch_in[i]),
                    ch_out=int(ch_in[i + 1]),
                    **norm_kwargs))
            self.downsample_list.append(transition)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: Outputs of the stages whose index is in ``return_idx``.
        """
        feat = self.downsample0(self.conv0(inputs['image']))

        selected = []
        last_stage = self.num_stages - 1
        for i, stage_block in enumerate(self.darknet_conv_block_list):
            feat = stage_block(feat)
            if i == self.freeze_at:
                feat.stop_gradient = True
            if i in self.return_idx:
                selected.append(feat)
            # Every stage except the last is followed by a downsample layer.
            if i < last_stage:
                feat = self.downsample_list[i](feat)
        return selected

    @property
    def out_shape(self):
        """One ShapeSpec (channels only) per returned stage."""
        return [ShapeSpec(channels=ch) for ch in self._out_channels]
|
||||
720
rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
Normal file
720
rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
Normal file
@@ -0,0 +1,720 @@
|
||||
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
|
||||
"""
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from .transformer_utils import DropPath, Identity
|
||||
from .transformer_utils import add_parameter, to_2tuple
|
||||
from .transformer_utils import ones_, zeros_, trunc_normal_
|
||||
from .swin_transformer import Mlp
|
||||
|
||||
__all__ = ['FocalNet']
|
||||
|
||||
# Per-architecture FocalNet settings, keyed by architecture name. Each entry
# holds the hyper-parameters and the URL of the matching pretrained weights.
MODEL_cfg = {
    'focalnet_T_224_1k_srf': {
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.2,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
    },
    'focalnet_S_224_1k_srf': {
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.3,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
    },
    'focalnet_B_224_1k_srf': {
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
    },
    'focalnet_T_224_1k_lrf': {
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.2,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
    },
    'focalnet_S_224_1k_lrf': {
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.3,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
    },
    'focalnet_B_224_1k_lrf': {
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
    },
    'focalnet_L_384_22k_fl3': {
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [5, 5, 5, 5],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
    },
    'focalnet_L_384_22k_fl4': {
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        # The only architecture in this table with the modulator normalized.
        'normalize_modulator': True,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
    },
    'focalnet_XL_384_22k_fl3': {
        'embed_dim': 256,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [5, 5, 5, 5],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
    },
    'focalnet_XL_384_22k_fl4': {
        'embed_dim': 256,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
    },
    'focalnet_H_224_22k_fl3': {
        'embed_dim': 352,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        # The two "H" architectures are the only ones enabling this flag.
        'use_postln_in_modulation': True,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
    },
    'focalnet_H_224_22k_fl4': {
        'embed_dim': 352,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': True,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
    },
}
|
||||
|
||||
|
||||
class FocalModulation(nn.Layer):
    """Focal modulation over a channels-last feature map.

    Args:
        dim (int): Number of input channels.
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        focal_factor (int): Step to increase the focal window. Default: 2
        use_postln_in_modulation (bool): Whether use post-modulation layernorm
        normalize_modulator (bool): Whether use normalize in modulator
    """

    def __init__(self,
                 dim,
                 proj_drop=0.,
                 focal_level=2,
                 focal_window=7,
                 focal_factor=2,
                 use_postln_in_modulation=False,
                 normalize_modulator=False):
        super().__init__()
        self.dim = dim

        # specific args for focalv3
        self.focal_level = focal_level
        self.focal_window = focal_window
        self.focal_factor = focal_factor
        self.use_postln_in_modulation = use_postln_in_modulation
        self.normalize_modulator = normalize_modulator

        # One linear layer emits the query, the context and one gate per
        # focal level plus one gate for the global context.
        num_gates = self.focal_level + 1
        self.f = nn.Linear(dim, 2 * dim + num_gates, bias_attr=True)
        self.h = nn.Conv2D(
            dim,
            dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            bias_attr=True)

        self.act = nn.GELU()
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.focal_layers = nn.LayerList()

        if self.use_postln_in_modulation:
            self.ln = nn.LayerNorm(dim)

        # Depth-wise conv + GELU per level; the kernel grows by focal_factor
        # at every level.
        for level in range(self.focal_level):
            kernel_size = self.focal_factor * level + self.focal_window
            depthwise_conv = nn.Conv2D(
                dim,
                dim,
                kernel_size=kernel_size,
                stride=1,
                groups=dim,
                padding=kernel_size // 2,
                bias_attr=False)
            self.focal_layers.append(nn.Sequential(depthwise_conv, nn.GELU()))

    def forward(self, x):
        """ Forward function.

        Args:
            x: input features with shape of (B, H, W, C)
        """
        _, _, _, channels = x.shape
        projected = self.f(x).transpose([0, 3, 1, 2])
        q, ctx, gates = paddle.split(
            projected, (channels, channels, self.focal_level + 1), 1)

        # Gated sum of the context produced at each focal level.
        ctx_all = 0
        for level, focal_layer in enumerate(self.focal_layers):
            ctx = focal_layer(ctx)
            ctx_all = ctx_all + ctx * gates[:, level:level + 1]

        # Global context: spatial mean of the last level, gated by the
        # final gate channel.
        pooled = ctx.mean(2, keepdim=True).mean(3, keepdim=True)
        ctx_global = self.act(pooled)
        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        modulated = q * self.h(ctx_all)
        modulated = modulated.transpose([0, 2, 3, 1])
        if self.use_postln_in_modulation:
            modulated = self.ln(modulated)
        return self.proj_drop(self.proj(modulated))
|
||||
|
||||
|
||||
class FocalModulationBlock(nn.Layer):
    """ Focal Modulation Block: focal modulation plus an MLP, each added to
    its input through a residual connection.

    ``self.H`` and ``self.W`` start as None and must be set to the spatial
    size of the input before ``forward`` is called.

    Args:
        dim (int): Number of input channels.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        focal_level (int): number of focal levels
        focal_window (int): focal kernel size at level 1
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value for layer scale. Default: 1e-4
    """

    def __init__(self,
                 dim,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 focal_level=2,
                 focal_window=9,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_layerscale=False,
                 layerscale_value=1e-4):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.focal_window = focal_window
        self.focal_level = focal_level
        self.use_postln = use_postln
        self.use_layerscale = use_layerscale

        self.norm1 = norm_layer(dim)
        self.modulation = FocalModulation(
            dim,
            proj_drop=drop,
            focal_level=self.focal_level,
            focal_window=self.focal_window,
            use_postln_in_modulation=use_postln_in_modulation,
            normalize_modulator=normalize_modulator)

        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.H = None
        self.W = None

        # Branch scales: plain 1.0 unless layerscale is enabled, in which
        # case they become parameters initialized to layerscale_value.
        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if self.use_layerscale:
            self.gamma_1 = add_parameter(
                self, layerscale_value * paddle.ones([dim]))
            self.gamma_2 = add_parameter(
                self, layerscale_value * paddle.ones([dim]))

    def forward(self, x):
        """
        Args:
            x: Input feature, tensor size (B, H*W, C).
        """
        _, num_tokens, channels = x.shape
        height, width = self.H, self.W
        assert num_tokens == height * width, "input feature has wrong size"

        residual = x
        pre_norm = not self.use_postln
        if pre_norm:
            x = self.norm1(x)

        # Focal modulation works on the (B, H, W, C) layout.
        x = self.modulation(x.reshape([-1, height, width, channels]))
        x = x.reshape([-1, height * width, channels])
        if not pre_norm:
            x = self.norm1(x)
        x = residual + self.drop_path(self.gamma_1 * x)

        # MLP branch: norm2 goes before the MLP in pre-norm mode and after
        # it in post-norm mode.
        if pre_norm:
            mlp_out = self.mlp(self.norm2(x))
        else:
            mlp_out = self.norm2(self.mlp(x))
        return x + self.drop_path(self.gamma_2 * mlp_out)
|
||||
|
||||
|
||||
class BasicLayer(nn.Layer):
    """ A basic focal modulation layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float] | np.ndarray, optional):
            Stochastic depth rate. A sequence is indexed per block (it must
            hold at least ``depth`` entries); a scalar is shared by all
            blocks. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_checkpoint (bool): Stored on the layer; ``forward`` here does not
            read it. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 focal_level=2,
                 focal_window=9,
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_checkpoint=False):
        super().__init__()
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # A list/tuple used to be forwarded whole to every block, where the
        # `drop_path > 0.` comparison fails; index any sequence per block.
        per_block_rate = isinstance(drop_path, (list, tuple, np.ndarray))

        # build blocks
        self.blocks = nn.LayerList([
            FocalModulationBlock(
                dim=dim,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if per_block_rate else drop_path,
                act_layer=nn.GELU,
                norm_layer=norm_layer,
                focal_level=focal_level,
                focal_window=focal_window,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(
                patch_size=2,
                in_chans=dim,
                embed_dim=2 * dim,
                use_conv_embed=use_conv_embed,
                norm_layer=norm_layer,
                is_stem=False)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H: Spatial height matching ``x``.
            W: Spatial width matching ``x``.

        Returns:
            tuple: ``(x, H, W, x_down, Wh, Ww)`` where ``x`` is the stage
            output at the input resolution and ``x_down, Wh, Ww`` are the
            downsampled tokens and their size. Without a downsample layer
            the last three simply repeat the first three.
        """
        for blk in self.blocks:
            # Each block reads its spatial size from these attributes.
            blk.H, blk.W = H, W
            x = blk(x)

        if self.downsample is not None:
            # Back to (B, C, H, W) for the downsample layer, then flatten
            # the result to (B, Wh*Ww, C') tokens again.
            x_reshaped = x.transpose([0, 2, 1]).reshape(
                [x.shape[0], x.shape[-1], H, W])
            x_down = self.downsample(x_reshaped)
            x_down = x_down.flatten(2).transpose([0, 2, 1])
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
        is_stem (bool): Is the stem block or not.
    """

    def __init__(self,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 norm_layer=None,
                 use_conv_embed=False,
                 is_stem=False):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.proj = nn.Conv2D(
                in_chans,
                embed_dim,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding)
        else:
            # Non-overlapping patches: kernel and stride both equal patch_size.
            self.proj = nn.Conv2D(
                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Pad ``x`` to a multiple of the patch size, project and normalize.

        Args:
            x: Input with four dimensions; the last two are read as H and W.

        Returns:
            The projected feature map with ``embed_dim`` channels.
        """
        _, _, H, W = x.shape
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]

        # The previous `W += W % patch` / `H += H % patch` updates computed a
        # wrong padded size and were never read afterwards, so they are gone.
        if W % patch_w != 0:
            # for 3D tensor: [pad_left, pad_right]
            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
        if H % patch_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])

        x = self.proj(x)
        if self.norm is not None:
            # The norm is applied on flattened tokens, then the map is
            # restored to (-1, embed_dim, Wh, Ww).
            _, _, Wh, Ww = x.shape
            x = x.flatten(2).transpose([0, 2, 1])
            x = self.norm(x)
            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])

        return x
|
||||
|
||||
|
||||
@register
@serializable
class FocalNet(nn.Layer):
    """FocalNet backbone.

    The architecture-dependent hyper-parameters (``embed_dim``, ``depths``,
    ``drop_path_rate``, ``focal_levels``, ``focal_windows``,
    ``use_conv_embed``, ``use_layerscale``, ``use_postln``,
    ``use_postln_in_modulation`` and ``normalize_modulator``) are always read
    from ``MODEL_cfg[arch]``; values passed for those arguments are ignored.

    Args:
        arch (str): Key into ``MODEL_cfg`` selecting the FocalNet variant.
        out_indices (Sequence[int]): Stages whose outputs are returned.
        frozen_stages (int): Controls freezing (stop grad and eval mode).
            ``>= 0`` freezes the patch embedding; ``>= 2`` additionally puts
            the dropout in eval mode and freezes the first
            ``frozen_stages - 1`` stages. -1 freezes nothing.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Overridden by ``MODEL_cfg[arch]``.
        depths (tuple[int]): Overridden by ``MODEL_cfg[arch]``.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Dropout rate.
        drop_path_rate (float): Overridden by ``MODEL_cfg[arch]``.
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding.
        focal_levels (Sequence[int]): Overridden by ``MODEL_cfg[arch]``.
        focal_windows (Sequence[int]): Overridden by ``MODEL_cfg[arch]``.
        use_conv_embed (bool): Overridden by ``MODEL_cfg[arch]``.
        use_layerscale (bool): Overridden by ``MODEL_cfg[arch]``.
        layerscale_value (float): Value of layerscale.
        use_postln (bool): Overridden by ``MODEL_cfg[arch]``.
        use_postln_in_modulation (bool): Overridden by ``MODEL_cfg[arch]``.
        normalize_modulator (bool): Overridden by ``MODEL_cfg[arch]``.
        use_checkpoint (bool): Forwarded to every ``BasicLayer``.
        pretrained (str | None): URL or local path of weights to load. When
            None, ``MODEL_cfg[arch]['pretrained']`` is used.
    """

    def __init__(
            self,
            arch='focalnet_T_224_1k_srf',
            out_indices=(0, 1, 2, 3),
            frozen_stages=-1,
            patch_size=4,
            in_chans=3,
            embed_dim=96,
            depths=[2, 2, 6, 2],
            mlp_ratio=4.,
            drop_rate=0.,
            drop_path_rate=0.2,  # 0.5 better for large+ models
            norm_layer=nn.LayerNorm,
            patch_norm=True,
            focal_levels=[2, 2, 2, 2],
            focal_windows=[3, 3, 3, 3],
            use_conv_embed=False,
            use_layerscale=False,
            layerscale_value=1e-4,
            use_postln=False,
            use_postln_in_modulation=False,
            normalize_modulator=False,
            use_checkpoint=False,
            pretrained=None):
        super(FocalNet, self).__init__()
        assert arch in MODEL_cfg, "Unsupported arch: {}".format(arch)

        # The preset for ``arch`` wins over the corresponding arguments.
        cfg = MODEL_cfg[arch]
        embed_dim = cfg['embed_dim']
        depths = cfg['depths']
        drop_path_rate = cfg['drop_path_rate']
        focal_levels = cfg['focal_levels']
        focal_windows = cfg['focal_windows']
        use_conv_embed = cfg['use_conv_embed']
        use_layerscale = cfg['use_layerscale']
        use_postln = cfg['use_postln']
        use_postln_in_modulation = cfg['use_postln_in_modulation']
        normalize_modulator = cfg['normalize_modulator']
        if pretrained is None:
            pretrained = cfg['pretrained']

        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.num_layers = len(depths)
        self.patch_norm = patch_norm

        # patch embedding stem
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
            use_conv_embed=use_conv_embed,
            is_stem=True)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # Channel width doubles at every stage.
        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]

        # stochastic depth decay rule: one rate per block, sliced per stage
        dpr = np.linspace(0, drop_path_rate, sum(depths))
        last_stage = self.num_layers - 1

        self.layers = nn.LayerList()
        start = 0
        for stage_idx, depth in enumerate(depths):
            stop = start + depth
            self.layers.append(
                BasicLayer(
                    dim=num_features[stage_idx],
                    depth=depth,
                    mlp_ratio=mlp_ratio,
                    drop=drop_rate,
                    drop_path=dpr[start:stop],
                    norm_layer=norm_layer,
                    # every stage but the last is followed by a downsample
                    downsample=PatchEmbed if stage_idx < last_stage else None,
                    focal_level=focal_levels[stage_idx],
                    focal_window=focal_windows[stage_idx],
                    use_conv_embed=use_conv_embed,
                    use_layerscale=use_layerscale,
                    layerscale_value=layerscale_value,
                    use_postln=use_postln,
                    use_postln_in_modulation=use_postln_in_modulation,
                    normalize_modulator=normalize_modulator,
                    use_checkpoint=use_checkpoint))
            start = stop

        self.num_features = num_features

        # one norm layer per returned stage, registered as ``norm<i>``
        for stage_idx in out_indices:
            self.add_sublayer(f'norm{stage_idx}',
                              norm_layer(num_features[stage_idx]))

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            # A value containing 'http' is treated as a URL and downloaded;
            # anything else is used as a local path.
            weights_path = (
                paddle.utils.download.get_weights_path_from_url(pretrained)
                if 'http' in pretrained else pretrained)
            self.set_state_dict(paddle.load(weights_path))

    def _freeze_stages(self):
        """Put the configured leading parts in eval mode and stop their grads."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for p in self.patch_embed.parameters():
                p.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for idx in range(self.frozen_stages - 1):
                stage = self.layers[idx]
                stage.eval()
                for p in stage.parameters():
                    p.stop_gradient = True

    def _init_weights(self, m):
        """Initializer applied to every sublayer via ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Run the backbone on ``x['image']`` and return the selected stages.

        Returns:
            list: One tensor per index in ``out_indices``, each normalized
            by its ``norm<i>`` layer, reshaped to
            ``[-1, H, W, num_features[i]]`` and transposed with
            ``(0, 3, 1, 2)``.
        """
        x = self.patch_embed(x['image'])
        _, _, Wh, Ww = x.shape
        x = self.pos_drop(x.flatten(2).transpose([0, 2, 1]))

        outs = []
        for i in range(self.num_layers):
            x_out, H, W, x, Wh, Ww = self.layers[i](x, Wh, Ww)
            if i not in self.out_indices:
                continue
            x_out = getattr(self, f'norm{i}')(x_out)
            feat = x_out.reshape([-1, H, W, self.num_features[i]])
            outs.append(feat.transpose((0, 3, 1, 2)))

        return outs

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels and stride) for every returned stage."""
        out_strides = [4, 8, 16, 32]
        specs = []
        for i in self.out_indices:
            specs.append(
                ShapeSpec(
                    channels=self.num_features[i], stride=out_strides[i]))
        return specs
|
||||
447
rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
Normal file
447
rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
Normal file
@@ -0,0 +1,447 @@
|
||||
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn.initializer import KaimingNormal, Constant
|
||||
from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle import ParamAttr
|
||||
|
||||
import copy
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['PPHGNetV2']
|
||||
|
||||
# Module-level initializer instances; each is called on a parameter to
# (re)initialize it in place.
kaiming_normal_ = KaimingNormal()
zeros_ = Constant(value=0.)  # constant fill with 0
ones_ = Constant(value=1.)  # constant fill with 1
|
||||
|
||||
|
||||
class LearnableAffineBlock(nn.Layer):
    """Learnable scalar affine transform ``scale * x + bias``.

    Args:
        scale_value (float): Initial value of the scale parameter.
        bias_value (float): Initial value of the bias parameter.
        lr_mult (float): Learning-rate multiplier of the enclosing stage.
        lab_lr (float): Extra multiplier; both parameters train with
            ``lr_mult * lab_lr``.
    """

    def __init__(self,
                 scale_value=1.0,
                 bias_value=0.0,
                 lr_mult=1.0,
                 lab_lr=0.01):
        super().__init__()
        # ``scale`` is created before ``bias`` to keep the creation order.
        for param_name, init_value in (("scale", scale_value),
                                       ("bias", bias_value)):
            param = self.create_parameter(
                shape=[1, ],
                default_initializer=Constant(value=init_value),
                attr=ParamAttr(learning_rate=lr_mult * lab_lr))
            setattr(self, param_name, param)
            self.add_parameter(param_name, param)

    def forward(self, x):
        return self.scale * x + self.bias
|
||||
|
||||
|
||||
class ConvBNAct(nn.Layer):
    """Conv2D (no bias) -> BatchNorm2D -> optional ReLU -> optional LAB.

    Args:
        in_channels (int): Input channels of the convolution.
        out_channels (int): Output channels of the convolution.
        kernel_size (int): Convolution kernel size.
        stride (int): Convolution stride.
        padding (int | str): A string is forwarded to ``Conv2D`` unchanged;
            any other value is replaced by ``(kernel_size - 1) // 2``.
        groups (int): Convolution groups.
        use_act (bool): Whether to apply ReLU after the batch norm.
        use_lab (bool): Whether to apply a ``LearnableAffineBlock`` after
            the activation.
        lr_mult (float): Learning-rate multiplier for conv and BN params.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 groups=1,
                 use_act=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab

        # Numeric paddings are ignored in favour of "same"-style padding
        # derived from the kernel size.
        if isinstance(padding, str):
            conv_padding = padding
        else:
            conv_padding = (kernel_size - 1) // 2

        self.conv = Conv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=conv_padding,
            groups=groups,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=False)
        # BN scale/shift are excluded from weight decay.
        bn_weight_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        bn_bias_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        self.bn = BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        # NOTE(review): the flattened source does not show indentation; the
        # LAB branch is reconstructed as nested under ``use_act`` (LAB only
        # exists when an activation is used) -- confirm against the
        # original file before merging.
        if self.use_act:
            self.act = ReLU()
            if self.use_lab:
                self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        x = self.bn(self.conv(x))
        if not self.use_act:
            return x
        x = self.act(x)
        if self.use_lab:
            x = self.lab(x)
        return x
|
||||
|
||||
|
||||
class LightConvBNAct(nn.Layer):
    """1x1 ``ConvBNAct`` without activation followed by a depthwise one.

    Args:
        in_channels (int): Input channels of the 1x1 convolution.
        out_channels (int): Output channels of both convolutions.
        kernel_size (int): Kernel size of the depthwise convolution.
        stride (int): Accepted for signature parity; not used by this block.
        groups (int): Accepted for signature parity; not used by this block.
        use_lab (bool): Forwarded to both ``ConvBNAct`` layers.
        lr_mult (float): Forwarded to both ``ConvBNAct`` layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
        # pointwise projection, no activation
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            **shared)
        # depthwise convolution (groups == channels) with activation
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            **shared)

    def forward(self, x):
        return self.conv2(self.conv1(x))
|
||||
|
||||
|
||||
class StemBlock(nn.Layer):
    """Stem: a stride-2 conv, a two-branch block (conv / max-pool) whose
    outputs are concatenated on axis 1, another stride-2 conv and a 1x1 conv.

    Args:
        in_channels (int): Channels of the input.
        mid_channels (int): Channels used inside the stem.
        out_channels (int): Channels produced by the final 1x1 conv.
        use_lab (bool): Forwarded to every ``ConvBNAct``.
        lr_mult (float): Forwarded to every ``ConvBNAct``.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        half_channels = mid_channels // 2
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)

        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        # convolutional branch: two 2x2 convs with "SAME" padding
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=half_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        self.stem2b = ConvBNAct(
            in_channels=half_channels,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        # consumes the concatenation of both branches (2 * mid_channels)
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            **shared)
        # pooling branch
        self.pool = nn.MaxPool2D(
            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")

    def forward(self, x):
        stem = self.stem1(x)
        conv_branch = self.stem2b(self.stem2a(stem))
        pool_branch = self.pool(stem)
        merged = paddle.concat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
|
||||
|
||||
|
||||
class HG_Block(nn.Layer):
    """Chain of conv layers whose input and every intermediate output are
    concatenated on axis 1 and re-projected by two 1x1 convolutions.

    Args:
        in_channels (int): Channels of the block input.
        mid_channels (int): Output channels of every layer in the chain.
        out_channels (int): Channels produced by the block.
        kernel_size (int): Kernel size used by the chained layers.
        layer_num (int): Number of chained layers.
        identity (bool): If True, add the block input to the output.
        light_block (bool): Use ``LightConvBNAct`` layers when True,
            ``ConvBNAct`` layers otherwise.
        use_lab (bool): Forwarded to every sub-layer.
        lr_mult (float): Forwarded to every sub-layer.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 layer_num=6,
                 identity=False,
                 light_block=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.identity = identity

        # Pick the layer class directly rather than eval()-ing its name:
        # same result, no dynamic code evaluation.
        block_cls = LightConvBNAct if light_block else ConvBNAct

        self.layers = nn.LayerList()
        for i in range(layer_num):
            self.layers.append(
                block_cls(
                    # only the first layer sees the block input width
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult))

        # feature aggregation: input plus every layer output are concatenated
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)

    def forward(self, x):
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = paddle.concat(output, axis=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x
|
||||
|
||||
|
||||
class HG_Stage(nn.Layer):
    """Optional stride-2 depthwise downsample followed by ``HG_Block``s.

    Args:
        in_channels (int): Channels of the stage input.
        mid_channels (int): ``mid_channels`` of every ``HG_Block``.
        out_channels (int): Channels produced by every ``HG_Block``.
        block_num (int): Number of ``HG_Block``s in the stage.
        layer_num (int): ``layer_num`` of every ``HG_Block``.
        downsample (bool): Whether to prepend the downsample convolution.
        light_block (bool): Forwarded to every ``HG_Block``.
        kernel_size (int): Forwarded to every ``HG_Block``.
        use_lab (bool): Forwarded to every sub-layer.
        lr_mult (float): Forwarded to every sub-layer.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 block_num,
                 layer_num=6,
                 downsample=True,
                 light_block=True,
                 kernel_size=3,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        # ``self.downsample`` holds the layer when enabled, otherwise the
        # (falsy) flag itself; ``forward`` relies on its truthiness.
        self.downsample = ConvBNAct(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=2,
            groups=in_channels,
            use_act=False,
            use_lab=use_lab,
            lr_mult=lr_mult) if downsample else downsample

        # The first block adapts the channel count and has no residual;
        # the remaining blocks keep ``out_channels`` and add their input.
        blocks = [
            HG_Block(
                in_channels=in_channels if idx == 0 else out_channels,
                mid_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                layer_num=layer_num,
                identity=idx != 0,
                light_block=light_block,
                use_lab=use_lab,
                lr_mult=lr_mult) for idx in range(block_num)
        ]
        self.blocks = nn.Sequential(*blocks)

    def forward(self, x):
        if self.downsample:
            x = self.downsample(x)
        return self.blocks(x)
|
||||
|
||||
|
||||
def _freeze_norm(m: nn.BatchNorm2D):
    """Build a frozen ``BatchNorm2D`` with the same feature count as ``m``.

    The returned layer is newly constructed: its weight and bias are
    non-trainable (learning rate 0, no decay, ``stop_gradient`` set) and it
    is created with ``use_global_stats=True``.
    """

    def frozen_attr():
        # a fresh ParamAttr per parameter
        return ParamAttr(
            learning_rate=0., regularizer=L2Decay(0.), trainable=False)

    frozen = nn.BatchNorm2D(
        m._num_features,
        weight_attr=frozen_attr(),
        bias_attr=frozen_attr(),
        use_global_stats=True)
    for p in frozen.parameters():
        p.stop_gradient = True
    return frozen
|
||||
|
||||
|
||||
def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
    """Recursively replace every ``BatchNorm2D`` under ``model``.

    Args:
        model (nn.Layer): Root layer to process.
        reset_func (callable): Maps a ``BatchNorm2D`` to its replacement.

    Returns:
        ``reset_func(model)`` when ``model`` itself is a ``BatchNorm2D``;
        otherwise ``model``, with replaced children re-attached in place.
    """
    if isinstance(model, nn.BatchNorm2D):
        return reset_func(model)

    for child_name, child in model.named_children():
        replacement = reset_bn(child, reset_func)
        # only touch the attribute when the child was actually swapped
        if replacement is not child:
            setattr(model, child_name, replacement)
    return model
|
||||
|
||||
|
||||
@register
@serializable
class PPHGNetV2(nn.Layer):
    """PP-HGNetV2 backbone.

    Args:
        arch (str): Key of ``arch_configs`` ('L' or 'X').
        use_lab (bool): Whether to use LearnableAffineBlock in the network.
        lr_mult_list (list): Learning-rate multipliers; index 0 is used for
            the stem and index ``i + 1`` for stage ``i``.
        return_idx (list): Indices of the stages whose outputs are returned.
        freeze_stem_only (bool): With ``freeze_at >= 0``, freeze only the
            stem when True; otherwise also freeze the first
            ``min(freeze_at + 1, num_stages)`` stages.
        freeze_at (int): Freezing is enabled when ``>= 0``.
        freeze_norm (bool): Replace every ``BatchNorm2D`` with a frozen one.

    ``forward`` takes a dict with an ``'image'`` entry and returns the list
    of selected stage outputs.
    """

    arch_configs = {
        'L': {
            'stem_channels': [3, 32, 48],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [48, 48, 128, 1, False, False, 3, 6],
                "stage2": [128, 96, 512, 1, True, False, 3, 6],
                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
            }
        },
        'X': {
            'stem_channels': [3, 32, 64],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [64, 64, 128, 1, False, False, 3, 6],
                "stage2": [128, 128, 512, 2, True, False, 3, 6],
                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
            }
        }
    }

    def __init__(self,
                 arch,
                 use_lab=False,
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
                 return_idx=[1, 2, 3],
                 freeze_stem_only=True,
                 freeze_at=0,
                 freeze_norm=True):
        super().__init__()
        self.use_lab = use_lab
        self.return_idx = return_idx

        arch_cfg = self.arch_configs[arch]
        stem_in, stem_mid, stem_out = arch_cfg['stem_channels'][:3]
        stage_config = arch_cfg['stage_config']

        self._out_strides = [4, 8, 16, 32]
        # third entry of every stage row is its out_channels
        self._out_channels = [row[2] for row in stage_config.values()]

        # stem
        self.stem = StemBlock(
            in_channels=stem_in,
            mid_channels=stem_mid,
            out_channels=stem_out,
            use_lab=use_lab,
            lr_mult=lr_mult_list[0])

        # stages
        self.stages = nn.LayerList()
        for stage_idx, row in enumerate(stage_config.values()):
            (in_channels, mid_channels, out_channels, block_num, downsample,
             light_block, kernel_size, layer_num) = row
            self.stages.append(
                HG_Stage(
                    in_channels=in_channels,
                    mid_channels=mid_channels,
                    out_channels=out_channels,
                    block_num=block_num,
                    layer_num=layer_num,
                    downsample=downsample,
                    light_block=light_block,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult_list[stage_idx + 1]))

        if freeze_at >= 0:
            self._freeze_parameters(self.stem)
            if not freeze_stem_only:
                frozen_count = min(freeze_at + 1, len(self.stages))
                for stage_idx in range(frozen_count):
                    self._freeze_parameters(self.stages[stage_idx])

        if freeze_norm:
            reset_bn(self, reset_func=_freeze_norm)

        self._init_weights()

    def _freeze_parameters(self, m):
        """Stop gradients for every parameter of ``m``."""
        for param in m.parameters():
            param.stop_gradient = True

    def _init_weights(self):
        """Re-initialize conv, batch-norm and linear sublayers."""
        for layer in self.sublayers():
            if isinstance(layer, nn.Conv2D):
                kaiming_normal_(layer.weight)
            elif isinstance(layer, nn.BatchNorm2D):
                ones_(layer.weight)
                zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                zeros_(layer.bias)

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels and stride) for every returned stage."""
        specs = []
        for i in self.return_idx:
            specs.append(
                ShapeSpec(
                    channels=self._out_channels[i],
                    stride=self._out_strides[i]))
        return specs

    def forward(self, inputs):
        feat = self.stem(inputs['image'])
        selected = []
        for stage_idx, stage in enumerate(self.stages):
            feat = stage(feat)
            if stage_idx in self.return_idx:
                selected.append(feat)
        return selected
|
||||
271
rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
Normal file
271
rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
Normal file
@@ -0,0 +1,271 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn import AdaptiveAvgPool2D, Conv2D
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['LCNet']
|
||||
|
||||
# Per-stage block specifications. Every row is
# [kernel_size, in_channels, out_channels, stride, use_se].
NET_CONFIG = {
    "blocks2": [
        [3, 16, 32, 1, False],
    ],
    "blocks3": [
        [3, 32, 64, 2, False],
        [3, 64, 64, 1, False],
    ],
    "blocks4": [
        [3, 64, 128, 2, False],
        [3, 128, 128, 1, False],
    ],
    # one stride-2 3x3 block followed by five identical 5x5 blocks
    "blocks5": [[3, 128, 256, 2, False]] +
    [[5, 256, 256, 1, False] for _ in range(5)],
    "blocks6": [
        [5, 256, 512, 2, True],
        [5, 512, 512, 1, True],
    ],
}
|
||||
|
||||
|
||||
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a multiple of ``divisor``, never below ``min_value``.

    Args:
        v (int | float): Value to round.
        divisor (int): The result is a multiple of this number unless
            ``min_value`` (not itself a multiple) dominates.
        min_value (int | None): Lower bound; defaults to ``divisor``.

    Returns:
        The rounded value, bumped by one ``divisor`` when rounding would
        fall below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, rounded)
    # never shrink the value by more than 10%
    if result < 0.9 * v:
        result += divisor
    return result
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D (no bias, Kaiming-normal init) -> BatchNorm2D -> activation.

    Args:
        num_channels (int): Input channels.
        filter_size (int): Kernel size; padding is ``(filter_size - 1) // 2``.
        num_filters (int): Output channels.
        stride (int): Convolution stride.
        num_groups (int): Convolution groups.
        act (str): ``'hard_swish'`` or ``'relu6'``.

    Raises:
        ValueError: If ``act`` is not one of the supported names.
    """

    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 num_groups=1,
                 act='hard_swish'):
        super().__init__()

        self.conv = Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=num_groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # BN scale/shift are excluded from weight decay.
        self.bn = nn.BatchNorm2D(
            num_filters,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        if act == 'hard_swish':
            self.act = nn.Hardswish()
        elif act == 'relu6':
            self.act = nn.ReLU6()
        else:
            # ``forward`` always calls ``self.act``; fail here with a clear
            # message instead of an AttributeError on the first forward pass.
            raise ValueError(
                "Unsupported act: {!r}; expected 'hard_swish' or 'relu6'".
                format(act))

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)
        return x
|
||||
|
||||
|
||||
class DepthwiseSeparable(nn.Layer):
    """Depthwise conv, optional SE module, then a 1x1 pointwise conv.

    Args:
        num_channels (int): Input channels (also the depthwise width).
        num_filters (int): Output channels of the pointwise conv.
        stride (int): Stride of the depthwise conv.
        dw_size (int): Kernel size of the depthwise conv.
        use_se (bool): Whether to insert an ``SEModule`` between the convs.
        act (str): Activation name forwarded to both ``ConvBNLayer``s.
    """

    def __init__(self,
                 num_channels,
                 num_filters,
                 stride,
                 dw_size=3,
                 use_se=False,
                 act='hard_swish'):
        super().__init__()
        self.use_se = use_se
        # depthwise: groups == channels, channel count unchanged
        self.dw_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_channels,
            filter_size=dw_size,
            stride=stride,
            num_groups=num_channels,
            act=act)
        if use_se:
            self.se = SEModule(num_channels)
        # pointwise: 1x1 conv changing the channel count
        self.pw_conv = ConvBNLayer(
            num_channels=num_channels,
            filter_size=1,
            num_filters=num_filters,
            stride=1,
            act=act)

    def forward(self, x):
        feat = self.dw_conv(x)
        if self.use_se:
            feat = self.se(feat)
        return self.pw_conv(feat)
|
||||
|
||||
|
||||
class SEModule(nn.Layer):
    """Squeeze-and-excitation style gate multiplied onto the input.

    Args:
        channel (int): Channels of the input.
        reduction (int): The hidden 1x1 conv has ``channel // reduction``
            output channels.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = channel // reduction
        self.avg_pool = AdaptiveAvgPool2D(1)
        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=squeezed,
            kernel_size=1,
            stride=1,
            padding=0)
        self.relu = nn.ReLU()
        self.conv2 = Conv2D(
            in_channels=squeezed,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0)
        self.hardsigmoid = nn.Hardsigmoid()

    def forward(self, x):
        # pool -> conv -> relu -> conv -> hardsigmoid yields the gate
        gate = self.avg_pool(x)
        gate = self.relu(self.conv1(gate))
        gate = self.hardsigmoid(self.conv2(gate))
        return paddle.multiply(x=x, y=gate)
|
||||
|
||||
|
||||
@register
@serializable
class LCNet(nn.Layer):
    """LCNet backbone built from ``NET_CONFIG``.

    Args:
        scale (float): Channel multiplier; every channel count is passed
            through ``make_divisible(channels * scale)``.
        feature_maps (list[int]): Selects the returned outputs. The outputs
            of ``blocks3`` .. ``blocks6`` are numbered 2 .. 5 and an output
            is returned when its number is in this list.
        act (str): Activation name forwarded to every conv layer.
    """

    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
        super().__init__()
        self.scale = scale
        self.feature_maps = feature_maps

        def scaled(channels):
            return make_divisible(channels * scale)

        def build_stage(key):
            # one DepthwiseSeparable per row of NET_CONFIG[key]
            return nn.Sequential(* [
                DepthwiseSeparable(
                    num_channels=scaled(in_c),
                    num_filters=scaled(out_c),
                    dw_size=k,
                    stride=s,
                    use_se=se,
                    act=act) for k, in_c, out_c, s, se in NET_CONFIG[key]
            ])

        self.conv1 = ConvBNLayer(
            num_channels=3,
            filter_size=3,
            num_filters=scaled(16),
            stride=2,
            act=act)

        self.blocks2 = build_stage("blocks2")
        self.blocks3 = build_stage("blocks3")
        self.blocks4 = build_stage("blocks4")
        self.blocks5 = build_stage("blocks5")
        self.blocks6 = build_stage("blocks6")

        # output width of a stage is the out_c of its last row
        stage_channels = [
            scaled(NET_CONFIG[key][-1][2])
            for key in ("blocks3", "blocks4", "blocks5", "blocks6")
        ]
        self._out_channels = [
            ch for idx, ch in enumerate(stage_channels)
            if idx + 2 in feature_maps
        ]

    def forward(self, inputs):
        x = self.blocks2(self.conv1(inputs['image']))

        stage_outputs = []
        for stage in (self.blocks3, self.blocks4, self.blocks5, self.blocks6):
            x = stage(x)
            stage_outputs.append(x)

        return [
            out for idx, out in enumerate(stage_outputs)
            if idx + 2 in self.feature_maps
        ]

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels only) for every returned output."""
        return [ShapeSpec(channels=ch) for ch in self._out_channels]
|
||||
402
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
Normal file
402
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['MobileNet']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D (no bias) -> BatchNorm2D -> optional ReLU / ReLU6.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        kernel_size (int): Convolution kernel size.
        stride (int): Convolution stride.
        padding (int): Convolution padding.
        num_groups (int): Convolution groups.
        act (str): ``'relu'`` or ``'relu6'``; any other value applies no
            activation.
        conv_lr (float): Learning rate of the conv weight.
        conv_decay (float): L2 decay of the conv weight.
        norm_decay (float): L2 decay of the batch-norm weight and bias.
        norm_type (str): The batch norm is only created for ``'bn'`` or
            ``'sync_bn'``.
        name (str | None): Accepted but not used by this layer.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 num_groups=1,
                 act='relu',
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.act = act

        conv_weight_attr = ParamAttr(
            learning_rate=conv_lr,
            initializer=KaimingNormal(),
            regularizer=L2Decay(conv_decay))
        self._conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=conv_weight_attr,
            bias_attr=False)

        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        if norm_type in ('sync_bn', 'bn'):
            self._batch_norm = nn.BatchNorm2D(
                out_channels, weight_attr=param_attr, bias_attr=bias_attr)

    def forward(self, x):
        x = self._batch_norm(self._conv(x))
        if self.act == "relu":
            return F.relu(x)
        if self.act == "relu6":
            return F.relu6(x)
        return x
|
||||
|
||||
|
||||
class DepthwiseSeparable(nn.Layer):
    """3x3 grouped ``ConvBNLayer`` followed by a 1x1 ``ConvBNLayer``.

    Args:
        in_channels (int): Input channels of the depthwise conv.
        out_channels1 (int): Unscaled output channels of the depthwise conv.
        out_channels2 (int): Unscaled output channels of the pointwise conv.
        num_groups (int): Unscaled group count of the depthwise conv.
        stride (int): Stride of the depthwise conv.
        scale (float): Multiplier applied (then truncated with ``int``) to
            ``out_channels1``, ``out_channels2`` and ``num_groups``.
        conv_lr (float): Forwarded to both conv layers.
        conv_decay (float): Forwarded to both conv layers.
        norm_decay (float): Forwarded to both conv layers.
        norm_type (str): Forwarded to both conv layers.
        name (str | None): Optional prefix; the children receive
            ``name + "_dw"`` / ``name + "_sep"``, or None when omitted.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups,
                 stride,
                 scale,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(DepthwiseSeparable, self).__init__()

        # ``name`` defaults to None; only build the suffixed names when a
        # prefix was given so the default no longer raises TypeError.
        dw_name = None if name is None else name + "_dw"
        sep_name = None if name is None else name + "_sep"

        self._depthwise_conv = ConvBNLayer(
            in_channels,
            int(out_channels1 * scale),
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=dw_name)

        self._pointwise_conv = ConvBNLayer(
            int(out_channels1 * scale),
            int(out_channels2 * scale),
            kernel_size=1,
            stride=1,
            padding=0,
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=sep_name)

    def forward(self, x):
        x = self._depthwise_conv(x)
        x = self._pointwise_conv(x)
        return x
|
||||
|
||||
|
||||
class ExtraBlock(nn.Layer):
    """1x1 ``ConvBNLayer`` followed by a 3x3 ``ConvBNLayer``, both ReLU6.

    Args:
        in_channels (int): Input channels of the 1x1 conv.
        out_channels1 (int): Output channels of the 1x1 conv.
        out_channels2 (int): Output channels of the 3x3 conv.
        num_groups (int): Groups used by both convs.
        stride (int): Stride of the 3x3 conv.
        conv_lr (float): Forwarded to both conv layers.
        conv_decay (float): Forwarded to both conv layers.
        norm_decay (float): Forwarded to both conv layers.
        norm_type (str): Forwarded to both conv layers.
        name (str | None): Optional prefix; the children receive
            ``name + "_extra1"`` / ``name + "_extra2"``, or None when omitted.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups=1,
                 stride=2,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ExtraBlock, self).__init__()

        # ``name`` defaults to None; only build the suffixed names when a
        # prefix was given so the default no longer raises TypeError.
        first_name = None if name is None else name + "_extra1"
        second_name = None if name is None else name + "_extra2"

        self.pointwise_conv = ConvBNLayer(
            in_channels,
            int(out_channels1),
            kernel_size=1,
            stride=1,
            padding=0,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=first_name)

        self.normal_conv = ConvBNLayer(
            int(out_channels1),
            int(out_channels2),
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=second_name)

    def forward(self, x):
        x = self.pointwise_conv(x)
        x = self.normal_conv(x)
        return x
|
||||
|
||||
|
||||
@register
@serializable
class MobileNet(nn.Layer):
    """MobileNet backbone assembled from DepthwiseSeparable blocks.

    The network is a stem ConvBNLayer (``conv1``) followed by 13
    DepthwiseSeparable blocks (``conv2_1`` ... ``conv6``) and, optionally,
    a chain of ExtraBlock layers (``conv7_1`` ...).

    Args:
        norm_type: forwarded to every sub-layer as ``norm_type``.
        norm_decay: forwarded to every sub-layer as ``norm_decay``.
        conv_decay: forwarded to every sub-layer as ``conv_decay``.
        scale: width multiplier; channel counts are ``int(base * scale)``.
        conv_learning_rate: forwarded to every sub-layer as ``conv_lr``.
        feature_maps: 1-based indices of the blocks whose outputs are
            returned by ``forward``. Extra blocks continue the numbering
            after the 13 backbone blocks. A single integer is wrapped in a
            list.
        with_extra_blocks: whether to build and run the extra blocks.
        extra_block_filters: one ``[out_channels1, out_channels2]`` pair per
            extra block.

    Note:
        The list defaults are kept as literals so the constructor signature
        stays unchanged; this class never mutates them.
    """
    __shared__ = ['norm_type']

    def __init__(self,
                 norm_type='bn',
                 norm_decay=0.,
                 conv_decay=0.,
                 scale=1,
                 conv_learning_rate=1.0,
                 feature_maps=[4, 6, 13],
                 with_extra_blocks=False,
                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
                                      [64, 128]]):
        super(MobileNet, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        self._out_channels = []

        # Keyword arguments shared by every sub-layer built below.
        shared_kwargs = dict(
            conv_lr=conv_learning_rate,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type)

        self.conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=int(32 * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            name="conv1",
            **shared_kwargs)

        # (sublayer name, base width, base output width, stride).
        # The base width is used for in_channels (scaled), out_channels1 and
        # num_groups; the base output width is out_channels2.
        block_specs = [
            ("conv2_1", 32, 64, 1),
            ("conv2_2", 64, 128, 2),  # 1/4
            ("conv3_1", 128, 128, 1),
            ("conv3_2", 128, 256, 2),  # 1/8
            ("conv4_1", 256, 256, 1),
            ("conv4_2", 256, 512, 2),  # 1/16
        ]
        block_specs += [("conv5_" + str(i + 1), 512, 512, 1)
                        for i in range(5)]
        block_specs += [
            ("conv5_6", 512, 1024, 2),  # 1/32
            ("conv6", 1024, 1024, 1),
        ]

        self.dwsl = []
        for layer_name, width, out_width, stride in block_specs:
            block = self.add_sublayer(
                layer_name,
                sublayer=DepthwiseSeparable(
                    in_channels=int(width * scale),
                    out_channels1=width,
                    out_channels2=out_width,
                    num_groups=width,
                    stride=stride,
                    scale=scale,
                    name=layer_name,
                    **shared_kwargs))
            self.dwsl.append(block)
            self._update_out_channels(
                int(out_width * scale), len(self.dwsl), feature_maps)

        if self.with_extra_blocks:
            self.extra_blocks = []
            # The last backbone block emits int(1024 * scale) channels. The
            # input width used to be hard-coded to 1024, which only matched
            # when scale == 1.
            prev_channels = int(1024 * scale)
            for i, block_filter in enumerate(self.extra_block_filters):
                layer_name = "conv7_" + str(i + 1)
                conv_extra = self.add_sublayer(
                    layer_name,
                    sublayer=ExtraBlock(
                        prev_channels,
                        block_filter[0],
                        block_filter[1],
                        name=layer_name,
                        **shared_kwargs))
                self.extra_blocks.append(conv_extra)
                self._update_out_channels(
                    block_filter[1],
                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
                prev_channels = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested feature map."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Return the list of feature maps selected by ``feature_maps``.

        Args:
            inputs: mapping whose ``'image'`` entry is fed to the stem layer.
        """
        outs = []
        y = self.conv1(inputs['image'])
        for i, block in enumerate(self.dwsl):
            y = block(y)
            if i + 1 in self.feature_maps:
                outs.append(y)

        if not self.with_extra_blocks:
            return outs

        # NOTE(review): the extra blocks are fed from the last *collected*
        # feature map, not from the last backbone block. The two coincide
        # only when index len(self.dwsl) is in feature_maps, and this line
        # raises IndexError when no backbone feature map was collected.
        y = outs[-1]
        for i, block in enumerate(self.extra_blocks):
            idx = i + len(self.dwsl)
            y = block(y)
            if idx + 1 in self.feature_maps:
                outs.append(y)
        return outs

    @property
    def out_shape(self):
        """ShapeSpec list with the channel count of each returned feature map."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
478
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
Normal file
478
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
Normal file
@@ -0,0 +1,478 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['MobileNetV3']
|
||||
|
||||
|
||||
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a multiple of ``divisor`` that is not too far below it.

    ``v`` is rounded to the nearest multiple of ``divisor`` (half rounds
    up), clamped from below by ``min_value`` (``divisor`` when omitted).
    If the result is less than 90% of ``v``, one more ``divisor`` is added.
    """
    lower_bound = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    candidate = max(lower_bound, rounded)
    return candidate + divisor if candidate < 0.9 * v else candidate
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D followed by BatchNorm2D and an optional activation.

    Args:
        in_c: input channels of the convolution.
        out_c: output channels of the convolution and width of the norm.
        filter_size: convolution kernel size.
        stride: convolution stride.
        padding: convolution padding.
        num_groups: convolution ``groups``.
        act: ``None``, ``"relu"``, ``"relu6"`` or ``"hard_swish"``. Any other
            value makes ``forward`` raise NotImplementedError.
        lr_mult: learning rate of the convolution weight and, unless
            ``freeze_norm`` is set, of the norm scale and bias.
        conv_decay: L2Decay coefficient of the convolution weight.
        norm_type: ``'bn'`` or ``'sync_bn'``; both build ``nn.BatchNorm2D``
            in this class.
        norm_decay: L2Decay coefficient of the norm scale and bias.
        freeze_norm: when true, the norm parameters get learning rate 0, are
            created non-trainable with ``stop_gradient = True``, and the norm
            is built with ``use_global_stats=True``.
        name: accepted for interface compatibility; not used by this class.

    Raises:
        NotImplementedError: if ``norm_type`` is not ``'bn'`` or ``'sync_bn'``.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 filter_size,
                 stride,
                 padding,
                 num_groups=1,
                 act=None,
                 lr_mult=1.,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=""):
        super(ConvBNLayer, self).__init__()
        # Fail early: any other norm type used to leave ``self.bn`` undefined
        # (AttributeError in forward) and, together with freeze_norm, hit an
        # unbound local variable in this constructor.
        if norm_type not in ['sync_bn', 'bn']:
            raise NotImplementedError(
                "norm_type [{}] is not supported, expected 'bn' or "
                "'sync_bn'.".format(norm_type))

        self.act = act
        self.conv = nn.Conv2D(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
            bias_attr=False)

        norm_lr = 0. if freeze_norm else lr_mult
        trainable = not freeze_norm
        param_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        bias_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        global_stats = True if freeze_norm else None
        self.bn = nn.BatchNorm2D(
            out_c,
            weight_attr=param_attr,
            bias_attr=bias_attr,
            use_global_stats=global_stats)
        if freeze_norm:
            for param in self.bn.parameters():
                param.stop_gradient = True

    def forward(self, x):
        """Apply convolution, batch norm and the configured activation."""
        x = self.bn(self.conv(x))
        if self.act is None:
            return x
        if self.act == "relu":
            return F.relu(x)
        if self.act == "relu6":
            return F.relu6(x)
        if self.act == "hard_swish":
            return F.hardswish(x)
        raise NotImplementedError(
            "The activation function is selected incorrectly.")
|
||||
|
||||
|
||||
class ResidualUnit(nn.Layer):
    """Expand (1x1) -> depthwise (kxk) -> optional SE -> linear (1x1) unit.

    A shortcut from the input is added when ``stride == 1`` and
    ``in_c == out_c``. With ``return_list`` set, ``forward`` returns
    ``[expand_output, unit_output]`` instead of the unit output alone.

    Args:
        in_c: input channels.
        mid_c: channels of the expand and depthwise layers.
        out_c: output channels of the linear layer.
        filter_size: kernel size of the depthwise layer.
        stride: stride of the depthwise layer.
        use_se: whether to insert an SEModule after the depthwise layer.
        lr_mult: forwarded to every sub-layer.
        conv_decay, norm_type, norm_decay, freeze_norm: forwarded to every
            ConvBNLayer (``conv_decay`` also goes to the SEModule).
        act: activation of the expand and depthwise layers; the linear layer
            has none.
        return_list: see above.
        name: prefix for the sub-layer names.
    """

    def __init__(self,
                 in_c,
                 mid_c,
                 out_c,
                 filter_size,
                 stride,
                 use_se,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 act=None,
                 return_list=False,
                 name=''):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_c == out_c
        self.use_se = use_se
        self.return_list = return_list

        # Options that are identical for the three ConvBNLayer instances.
        common = dict(
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm)

        self.expand_conv = ConvBNLayer(
            in_c=in_c,
            out_c=mid_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=act,
            name=name + "_expand",
            **common)
        self.bottleneck_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=mid_c,
            filter_size=filter_size,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            num_groups=mid_c,
            act=act,
            name=name + "_depthwise",
            **common)
        if self.use_se:
            self.mid_se = SEModule(
                mid_c, lr_mult, conv_decay, name=name + "_se")
        self.linear_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=out_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=None,
            name=name + "_linear",
            **common)

    def forward(self, inputs):
        """Run the unit; see the class docstring for the return value."""
        expanded = self.expand_conv(inputs)
        out = self.bottleneck_conv(expanded)
        if self.use_se:
            out = self.mid_se(out)
        out = self.linear_conv(out)
        if self.if_shortcut:
            out = paddle.add(inputs, out)
        if not self.return_list:
            return out
        return [expanded, out]
|
||||
|
||||
|
||||
class SEModule(nn.Layer):
    """Channel gating: pooled features pass through two 1x1 convolutions.

    The input is average-pooled to 1x1, reduced to ``channel // reduction``
    channels, passed through ReLU, expanded back to ``channel`` channels,
    passed through a hard sigmoid, and multiplied with the input.

    Args:
        channel: number of input (and output) channels.
        lr_mult: learning rate of both convolutions' weights and biases.
        conv_decay: L2Decay coefficient of both convolutions' weights and
            biases.
        reduction: divisor used for the intermediate channel count.
        name: accepted for interface compatibility; not used by this class.
    """

    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
        super(SEModule, self).__init__()

        def make_attr():
            # A fresh ParamAttr/L2Decay pair for every parameter.
            return ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay))

        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        mid_channels = int(channel // reduction)
        self.conv1 = nn.Conv2D(
            in_channels=channel,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=make_attr(),
            bias_attr=make_attr())
        self.conv2 = nn.Conv2D(
            in_channels=mid_channels,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=make_attr(),
            bias_attr=make_attr())

    def forward(self, inputs):
        """Return ``inputs`` multiplied by the computed channel gate."""
        gate = self.avg_pool(inputs)
        gate = F.relu(self.conv1(gate))
        gate = F.hardsigmoid(self.conv2(gate), slope=0.2, offset=0.5)
        return paddle.multiply(x=inputs, y=gate)
|
||||
|
||||
|
||||
class ExtraBlockDW(nn.Layer):
    """Extra block: 1x1 conv -> 3x3 grouped conv -> 1x1 conv.

    All three layers are ConvBNLayer instances built with ``act='relu6'``
    and ``padding='SAME'``. Only the 3x3 layer uses ``stride``; it is built
    with ``num_groups=int(ch_1)``.

    Args:
        in_c: input channels of the first 1x1 layer.
        ch_1: output channels of the first 1x1 layer.
        ch_2: output channels of the 3x3 layer and of the final 1x1 layer.
        stride: stride of the 3x3 layer.
        lr_mult: forwarded to every layer.
        conv_decay, norm_type, norm_decay, freeze_norm: forwarded to every
            layer.
        name: prefix of the layer names; ``None`` (the default) is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_c,
                 ch_1,
                 ch_2,
                 stride,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None):
        super(ExtraBlockDW, self).__init__()
        # The declared default ``name=None`` previously raised a TypeError in
        # the string concatenations below, so map it to an empty prefix.
        prefix = '' if name is None else name

        self.pointwise_conv = ConvBNLayer(
            in_c=in_c,
            out_c=ch_1,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra1")
        self.depthwise_conv = ConvBNLayer(
            in_c=ch_1,
            out_c=ch_2,
            filter_size=3,
            stride=stride,
            padding='SAME',
            num_groups=int(ch_1),
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_dw")
        self.normal_conv = ConvBNLayer(
            in_c=ch_2,
            out_c=ch_2,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_sep")

    def forward(self, inputs):
        """Apply the three layers in order and return the result."""
        x = self.pointwise_conv(inputs)
        x = self.depthwise_conv(x)
        x = self.normal_conv(x)
        return x
|
||||
|
||||
|
||||
@register
@serializable
class MobileNetV3(nn.Layer):
    """MobileNetV3 backbone ("large" or "small") with optional extra blocks.

    Layers are numbered from 2: the ResidualUnit built from ``cfg[i]`` is
    ``conv{i + 2}``. ``feature_maps`` lists the layer numbers whose outputs
    ``forward`` returns; the extra layers continue the same numbering.

    Args:
        scale: width multiplier applied through ``make_divisible``.
        model_name: ``"large"`` or ``"small"``; anything else raises
            NotImplementedError.
        feature_maps: layer numbers to return (an integer is wrapped in a
            list).
        with_extra_blocks: build a 1x1 ConvBNLayer plus one ExtraBlockDW per
            entry of ``extra_block_filters`` after the residual units. When
            set, a selected residual unit contributes the output of its
            expand layer instead of its own output.
        extra_block_filters: ``[ch_1, ch_2]`` pair per ExtraBlockDW.
        lr_mult_list: learning-rate multipliers; residual unit ``i`` uses
            entry ``min(i // 3, len(lr_mult_list) - 1)``.
        conv_decay, norm_type, norm_decay, freeze_norm: forwarded to the
            sub-layers.
        multiplier: when not 1.0, rescales selected entries of the last
            three ``cfg`` rows.

    Raises:
        ValueError: if ``norm_type == 'sync_bn'`` and ``freeze_norm`` is set.
        NotImplementedError: for an unknown ``model_name``.
    """
    __shared__ = ['norm_type']

    def __init__(
            self,
            scale=1.0,
            model_name="large",
            feature_maps=[6, 12, 15],
            with_extra_blocks=False,
            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
            conv_decay=0.0,
            multiplier=1.0,
            norm_type='bn',
            norm_decay=0.0,
            freeze_norm=False):
        super(MobileNetV3, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        if norm_type == 'sync_bn' and freeze_norm:
            raise ValueError(
                "The norm_type should not be sync_bn when freeze_norm is True")
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        if model_name == "large":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, "relu", 1],
                [3, 64, 24, False, "relu", 2],
                [3, 72, 24, False, "relu", 1],
                [5, 72, 40, True, "relu", 2],  # RCNN output
                [5, 120, 40, True, "relu", 1],
                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
                [3, 200, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 480, 112, True, "hard_swish", 1],
                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 960, 160, True, "hard_swish", 1],
                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
            ]
        elif model_name == "small":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, "relu", 2],
                [3, 72, 24, False, "relu", 2],  # RCNN output
                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
                [5, 240, 40, True, "hard_swish", 1],
                [5, 240, 40, True, "hard_swish", 1],
                [5, 120, 48, True, "hard_swish", 1],
                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 576, 96, True, "hard_swish", 1],
                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
            ]
        else:
            raise NotImplementedError(
                "mode[{}_model] is not implemented!".format(model_name))

        if multiplier != 1.0:
            # (row, column) positions of the cfg entries that get rescaled.
            for row, col in ((-3, 2), (-2, 1), (-2, 2), (-1, 1), (-1, 2)):
                self.cfg[row][col] = int(self.cfg[row][col] * multiplier)

        # Options shared by every ConvBNLayer-based sub-layer.
        norm_kwargs = dict(
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm)

        stem_planes = make_divisible(16 * scale)
        self.conv1 = ConvBNLayer(
            in_c=3,
            out_c=stem_planes,
            filter_size=3,
            stride=2,
            padding=1,
            num_groups=1,
            act="hard_swish",
            lr_mult=lr_mult_list[0],
            name="conv1",
            **norm_kwargs)

        self._out_channels = []
        self.block_list = []
        last_lr_idx = len(lr_mult_list) - 1
        in_planes = stem_planes
        for block_idx, (k, exp, c, se, nl, s) in enumerate(self.cfg):
            layer_id = block_idx + 2
            layer_name = "conv" + str(layer_id)
            lr_mult = lr_mult_list[min(block_idx // 3, last_lr_idx)]

            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
            return_list = self.with_extra_blocks and layer_id in self.feature_maps

            mid_planes = make_divisible(scale * exp)
            out_planes = make_divisible(scale * c)
            unit = self.add_sublayer(
                layer_name,
                sublayer=ResidualUnit(
                    in_c=in_planes,
                    mid_c=mid_planes,
                    out_c=out_planes,
                    filter_size=k,
                    stride=s,
                    use_se=se,
                    act=nl,
                    lr_mult=lr_mult,
                    return_list=return_list,
                    name=layer_name,
                    **norm_kwargs))
            self.block_list.append(unit)
            in_planes = out_planes
            self._update_out_channels(mid_planes if return_list else in_planes,
                                      layer_id, feature_maps)

        if self.with_extra_blocks:
            self.extra_block_list = []
            extra_out_c = make_divisible(scale * self.cfg[-1][1])
            num_units = len(self.cfg)
            # One multiplier, chosen from the unit count, serves all extras.
            lr_mult = lr_mult_list[min(num_units // 3, last_lr_idx)]

            layer_id = num_units + 2
            layer_name = "conv" + str(layer_id)
            conv_extra = self.add_sublayer(
                layer_name,
                sublayer=ConvBNLayer(
                    in_c=in_planes,
                    out_c=extra_out_c,
                    filter_size=1,
                    stride=1,
                    padding=0,
                    num_groups=1,
                    act="hard_swish",
                    lr_mult=lr_mult,
                    name=layer_name,
                    **norm_kwargs))
            self.extra_block_list.append(conv_extra)
            self._update_out_channels(extra_out_c, layer_id, feature_maps)

            prev_planes = extra_out_c
            for block_filter in self.extra_block_filters:
                layer_id += 1
                conv_extra = self.add_sublayer(
                    "conv" + str(layer_id),
                    sublayer=ExtraBlockDW(
                        prev_planes,
                        block_filter[0],
                        block_filter[1],
                        stride=2,
                        lr_mult=lr_mult,
                        name='conv' + str(layer_id),
                        **norm_kwargs))
                self.extra_block_list.append(conv_extra)
                self._update_out_channels(block_filter[1], layer_id,
                                          feature_maps)
                prev_planes = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested feature map."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Return the feature maps selected by ``feature_maps``.

        Args:
            inputs: mapping whose ``'image'`` entry is fed to the stem layer.
        """
        x = self.conv1(inputs['image'])
        outs = []
        for layer_id, unit in enumerate(self.block_list, 2):
            x = unit(x)
            if layer_id in self.feature_maps:
                if isinstance(x, list):
                    # The unit returned [expand_output, unit_output].
                    outs.append(x[0])
                    x = x[1]
                else:
                    outs.append(x)

        if not self.with_extra_blocks:
            return outs

        first_extra_id = len(self.block_list) + 2
        for layer_id, extra in enumerate(self.extra_block_list,
                                         first_extra_id):
            x = extra(x)
            if layer_id in self.feature_maps:
                outs.append(x)
        return outs

    @property
    def out_shape(self):
        """ShapeSpec list with the channel count of each returned feature map."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
266
rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
Normal file
266
rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
Normal file
@@ -0,0 +1,266 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
|
||||
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
|
||||
The copyright of microsoft/Swin-Transformer is as follows:
|
||||
MIT License [see LICENSE for details]
|
||||
"""
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Normal, Constant
|
||||
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ppdet.modeling.layers import ConvNormLayer
|
||||
|
||||
|
||||
class MobileOneBlock(nn.Layer):
    """Re-parameterizable block: a depthwise stage followed by a 1x1 stage.

    In training form, stage 1 sums ``conv_num`` depthwise ConvNormLayer
    branches, one 1x1 depthwise ConvNormLayer (``rbr_1x1``) and, when
    ``ch_in == ch_out`` and ``stride == 1``, a BatchNorm identity branch.
    Stage 2 sums ``conv_num`` 1x1 ConvNormLayer branches and, under the same
    condition, a second BatchNorm identity branch. The activation is applied
    after each stage.

    ``convert_to_deploy`` folds every branch into two plain convolutions
    (``conv1`` and ``conv2``) and removes the branches; ``forward`` then uses
    only those two convolutions.

    Args:
        ch_in: input channels (also the width of stage 1).
        ch_out: output channels of stage 2.
        stride: stride of stage 1.
        kernel_size: kernel size of the depthwise branches.
        conv_num: number of parallel branches per stage.
        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
            initializer, skip_quant: forwarded to every ConvNormLayer.
        act: activation; ``None``, a string or a dict is resolved with
            ``get_act_fn``, anything else is used as given.
    """

    def __init__(
            self,
            ch_in,
            ch_out,
            stride,
            kernel_size,
            conv_num=1,
            norm_type='bn',
            norm_decay=0.,
            norm_groups=32,
            bias_on=False,
            lr_scale=1.,
            freeze_norm=False,
            initializer=Normal(
                mean=0., std=0.01),
            skip_quant=False,
            act='relu', ):
        super(MobileOneBlock, self).__init__()

        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num

        # Options shared by every ConvNormLayer branch.
        branch_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)

        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    **branch_kwargs))
            self.point_conv.append(
                ConvNormLayer(
                    ch_in, ch_out, 1, stride=1, groups=1, **branch_kwargs))
        self.rbr_1x1 = ConvNormLayer(
            ch_in,
            ch_in,
            1,
            stride=self.stride,
            groups=ch_in,
            **branch_kwargs)

        # Identity (BatchNorm-only) branches exist only when the block keeps
        # both the channel count and the spatial size.
        has_identity = ch_in == ch_out and self.stride == 1
        self.rbr_identity_st1 = nn.BatchNorm2D(
            num_features=ch_in,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.rbr_identity_st2 = nn.BatchNorm2D(
            num_features=ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if has_identity else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act

    def forward(self, x):
        """Run the fused path when ``conv1``/``conv2`` exist, else the branches."""
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            return self.act(self.conv2(self.act(self.conv1(x))))

        if self.rbr_identity_st1 is None:
            id_out_st1 = 0
        else:
            id_out_st1 = self.rbr_identity_st1(x)

        x1_1 = 0
        for i in range(self.k):
            x1_1 += self.depth_conv[i](x)

        x1_2 = self.rbr_1x1(x)
        x1 = self.act(x1_1 + x1_2 + id_out_st1)

        if self.rbr_identity_st2 is None:
            id_out_st2 = 0
        else:
            id_out_st2 = self.rbr_identity_st2(x1)

        x2_1 = 0
        for i in range(self.k):
            x2_1 += self.point_conv[i](x1)
        return self.act(x2_1 + id_out_st2)

    def convert_to_deploy(self):
        """Fold all branches into ``conv1``/``conv2`` and delete the branches.

        Calling this again after a successful conversion is a no-op.
        """
        if not hasattr(self, 'depth_conv'):
            # The branches were already fused and removed by an earlier call;
            # recomputing the kernels would fail on the missing attributes.
            return

        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))

        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = \
            self.get_equivalent_kernel_bias()
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)

        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')

    def get_equivalent_kernel_bias(self):
        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)``.

        Each value is the sum of the norm-folded kernels (or biases) of all
        branches of the corresponding stage.
        """
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1, kernel_size=self.kernel_size)

        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1)

        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid
        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid

        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid

        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a 1x1 kernel by ``(kernel_size - 1) // 2`` on every spatial side.

        Returns 0 when ``kernel1x1`` is None.
        """
        if kernel1x1 is None:
            return 0
        padding_size = (self.kernel_size - 1) // 2
        return nn.functional.pad(
            kernel1x1,
            [padding_size, padding_size, padding_size, padding_size])

    @staticmethod
    def _fold_norm(kernel, norm):
        """Fold the statistics and affine parameters of ``norm`` into ``kernel``.

        Returns the rescaled kernel and the bias implied by the norm.
        """
        running_mean = norm._mean
        running_var = norm._variance
        gamma = norm.weight
        beta = norm.bias
        eps = norm._epsilon

        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std

    def _fuse_bn_tensor(self, branch, kernel_size=3):
        """Return the norm-folded ``(kernel, bias)`` of ``branch``.

        ``branch`` may be None (returns ``(0, 0)``), an ``nn.LayerList`` of
        conv+norm layers (results are summed), a single ConvNormLayer, or an
        ``nn.BatchNorm2D`` identity branch. For the identity branch a kernel
        of spatial size ``kernel_size`` with a single 1 per output channel
        is built first.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.LayerList):
            fused_kernels = []
            fused_bias = []
            for block in branch:
                kernel, bias = self._fold_norm(block.conv.weight, block.norm)
                fused_kernels.append(kernel)
                fused_bias.append(bias)
            return sum(fused_kernels), sum(fused_bias)

        if isinstance(branch, ConvNormLayer):
            return self._fold_norm(branch.conv.weight, branch.norm)

        assert isinstance(branch, nn.BatchNorm2D)
        # A 1x1 identity spans all input channels; a larger one is built as a
        # kernel with one input channel per output channel.
        input_dim = self.ch_in if kernel_size == 1 else 1
        kernel_value = paddle.zeros(
            shape=[self.ch_in, input_dim, kernel_size, kernel_size],
            dtype='float32')
        if kernel_size > 1:
            center = (kernel_size - 1) // 2
            for i in range(self.ch_in):
                kernel_value[i, i % input_dim, center, center] = 1
        elif kernel_size == 1:
            for i in range(self.ch_in):
                kernel_value[i, i % input_dim, 0, 0] = 1
        else:
            raise ValueError("Invalid kernel size recieved!")
        kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
        return self._fold_norm(kernel, branch)
|
||||
69
rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
Normal file
69
rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
Normal file
@@ -0,0 +1,69 @@
|
||||
class NameAdapter(object):
    """Map backbone layer names to the names used by pretrained weights.

    The naming scheme depends on the wrapped model's ``_model_type``
    attribute; ``'SEResNeXt'`` uses its own scheme in most methods.
    """

    def __init__(self, model):
        super(NameAdapter, self).__init__()
        self.model = model

    @property
    def model_type(self):
        """The model's ``_model_type`` attribute, or ``''`` when absent."""
        return getattr(self.model, '_model_type', '')

    @property
    def variant(self):
        """The model's ``variant`` attribute, or ``''`` when absent."""
        return getattr(self.model, 'variant', '')

    def _is_se_resnext(self):
        """Whether the wrapped model reports the SEResNeXt model type."""
        return self.model_type == 'SEResNeXt'

    def fix_conv_norm_name(self, name):
        """Return the norm-layer name that belongs to conv layer ``name``."""
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        # the naming rule is same as pretrained weight
        return name + "_bn" if self._is_se_resnext() else bn_name

    def fix_shortcut_name(self, name):
        """Return the shortcut layer name derived from ``name``."""
        if self._is_se_resnext():
            return 'conv' + name + '_prj'
        return name

    def fix_bottleneck_name(self, name):
        """Return ``(conv1, conv2, conv3, shortcut)`` names for a bottleneck."""
        if self._is_se_resnext():
            base = 'conv' + name
            return base + '_x1', base + '_x2', base + '_x3', name
        return (name + "_branch2a", name + "_branch2b", name + "_branch2c",
                name + "_branch1")

    def fix_basicblock_name(self, name):
        """Return ``(conv1, conv2, shortcut)`` names for a basic block."""
        if self._is_se_resnext():
            base = 'conv' + name
            return base + '_x1', base + '_x2', name
        return name + "_branch2a", name + "_branch2b", name + "_branch1"

    def fix_layer_warp_name(self, stage_num, count, i):
        """Return the name of block ``i`` within stage ``stage_num``.

        ``count`` is compared against 10 to switch stage 4 to the
        ``a, b1, b2, ...`` suffix scheme.
        """
        stage_name = 'res' + str(stage_num)
        if count > 10 and stage_num == 4:
            suffix = "a" if i == 0 else "b" + str(i)
        else:
            suffix = chr(ord("a") + i)
        conv_name = stage_name + suffix
        if self._is_se_resnext():
            conv_name = str(stage_num + 2) + '_' + str(i + 1)
        return conv_name

    def fix_c1_stage_name(self):
        """Return the name of the first stage."""
        if self.model_type == 'ResNeXt':
            return "res_conv1"
        return "conv1"
|
||||
611
rtdetr_paddle/ppdet/modeling/backbones/resnet.py
Executable file
611
rtdetr_paddle/ppdet/modeling/backbones/resnet.py
Executable file
@@ -0,0 +1,611 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from numbers import Integral
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Uniform
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn.initializer import Constant
|
||||
from paddle.vision.ops import DeformConv2D
|
||||
from .name_adapter import NameAdapter
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
|
||||
|
||||
# Number of residual blocks in each of the four stages, keyed by network depth.
ResNet_cfg = dict([
    (18, [2, 2, 2, 2]),
    (34, [3, 4, 6, 3]),
    (50, [3, 4, 6, 3]),
    (101, [3, 4, 23, 3]),
    (152, [3, 8, 36, 3]),
])
|
||||
|
||||
|
||||
class ConvNormLayer(nn.Layer):
    """Convolution followed by batch norm and an optional activation.

    The convolution is a plain ``nn.Conv2D`` or, when `dcn_v2` is set, a
    ``DeformConv2D`` driven by an extra zero-initialized conv that predicts
    the offsets and the modulation mask.

    Args:
        ch_in (int): input channels.
        ch_out (int): output channels.
        filter_size (int): kernel size; padding is ``(filter_size - 1) // 2``.
        stride (int): convolution stride.
        groups (int): convolution groups.
        act (str|None): name of a function in ``paddle.nn.functional``
            applied after the norm; falsy means no activation.
        norm_type (str): 'bn' or 'sync_bn' (both build ``nn.BatchNorm2D``).
        norm_decay (float): L2 decay coefficient of the norm weight and bias.
        freeze_norm (bool): use global statistics and stop gradients of the
            norm parameters.
        lr (float): learning-rate multiplier of the conv weight (and of the
            norm parameters when they are not frozen).
        dcn_v2 (bool): use the deformable convolution.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size,
                 stride,
                 groups=1,
                 act=None,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 lr=1.0,
                 dcn_v2=False):
        super().__init__()
        assert norm_type in ['bn', 'sync_bn']
        self.norm_type = norm_type
        self.act = act
        self.dcn_v2 = dcn_v2

        same_padding = (filter_size - 1) // 2
        if self.dcn_v2:
            # Two offset values and one mask value per kernel position.
            kernel_area = filter_size**2
            self.offset_channel = 2 * kernel_area
            self.mask_channel = kernel_area

            self.conv_offset = nn.Conv2D(
                in_channels=ch_in,
                out_channels=3 * kernel_area,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                weight_attr=ParamAttr(initializer=Constant(0.)),
                bias_attr=ParamAttr(initializer=Constant(0.)))
            self.conv = DeformConv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                dilation=1,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)
        else:
            self.conv = nn.Conv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)

        # A frozen norm gets a zero learning rate and non-trainable parameters.
        norm_lr = 0. if freeze_norm else lr

        def make_norm_attr():
            return ParamAttr(
                learning_rate=norm_lr,
                regularizer=L2Decay(norm_decay),
                trainable=not freeze_norm)

        param_attr = make_norm_attr()
        bias_attr = make_norm_attr()

        global_stats = True if freeze_norm else None
        if norm_type in ['sync_bn', 'bn']:
            self.norm = nn.BatchNorm2D(
                ch_out,
                weight_attr=param_attr,
                bias_attr=bias_attr,
                use_global_stats=global_stats)
        norm_params = self.norm.parameters()

        if freeze_norm:
            for norm_param in norm_params:
                norm_param.stop_gradient = True

    def forward(self, inputs):
        """Apply conv, then norm, then the optional activation."""
        if self.dcn_v2:
            # Split the prediction into offsets and a sigmoid-squashed mask.
            offset, mask = paddle.split(
                self.conv_offset(inputs),
                num_or_sections=[self.offset_channel, self.mask_channel],
                axis=1)
            out = self.conv(inputs, offset, mask=F.sigmoid(mask))
        else:
            out = self.conv(inputs)

        if self.norm_type in ['bn', 'sync_bn']:
            out = self.norm(out)
        if self.act:
            activation = getattr(F, self.act)
            out = activation(out)
        return out
|
||||
|
||||
|
||||
class SELayer(nn.Layer):
    """Channel re-weighting: global pool -> Linear -> ReLU -> Linear -> sigmoid.

    Args:
        ch (int): number of channels of the input feature map.
        reduction_ratio (int): the hidden width is ``ch // reduction_ratio``.
    """

    def __init__(self, ch, reduction_ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        hidden_ch = ch // reduction_ratio

        # Both linear layers draw weights uniformly in +-1/sqrt(fan_in).
        squeeze_bound = 1.0 / math.sqrt(ch)
        self.squeeze = nn.Linear(
            ch,
            hidden_ch,
            weight_attr=paddle.ParamAttr(
                initializer=Uniform(-squeeze_bound, squeeze_bound)),
            bias_attr=True)

        extract_bound = 1.0 / math.sqrt(hidden_ch)
        self.extract = nn.Linear(
            hidden_ch,
            ch,
            weight_attr=paddle.ParamAttr(
                initializer=Uniform(-extract_bound, extract_bound)),
            bias_attr=True)

    def forward(self, inputs):
        """Return `inputs` multiplied by the predicted per-channel gates."""
        pooled = paddle.squeeze(self.pool(inputs), axis=[2, 3])
        hidden = F.relu(self.squeeze(pooled))
        gates = F.sigmoid(self.extract(hidden))
        # Restore the two squeezed axes so the gates broadcast over them.
        gates = paddle.unsqueeze(gates, axis=[2, 3])
        return gates * inputs
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    """Residual block made of two 3x3 conv-norm layers.

    Args:
        ch_in (int): input channels.
        ch_out (int): output channels.
        stride (int): stride of the first 3x3 conv (and of the projection).
        shortcut (bool): True to add the input unchanged; False to project it
            through a 1x1 conv-norm layer first.
        variant (str): with 'd' and ``stride == 2`` the projection is an
            average pool followed by a stride-1 1x1 conv.
        groups (int): must be 1.
        base_width (int): must be 64.
        lr (float): learning-rate multiplier passed to every conv-norm layer.
        norm_type (str): passed to every conv-norm layer.
        norm_decay (float): passed to every conv-norm layer.
        freeze_norm (bool): passed to every conv-norm layer.
        dcn_v2 (bool): use a deformable conv in the second 3x3 layer.
        std_senet (bool): append an ``SELayer`` to the residual branch.
    """

    expansion = 1

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()
        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'

        # Settings shared by every conv-norm layer of the block.
        shared_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                # Downsample by pooling, then project with a stride-1 conv.
                projection = nn.Sequential()
                projection.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                projection.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=ch_out,
                        filter_size=1,
                        stride=1,
                        **shared_cfg))
                self.short = projection
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=ch_out,
                    filter_size=1,
                    stride=stride,
                    **shared_cfg)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=3,
            stride=stride,
            act='relu',
            **shared_cfg)

        self.branch2b = ConvNormLayer(
            ch_in=ch_out,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            act=None,
            dcn_v2=dcn_v2,
            **shared_cfg)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(ch_out)

    def forward(self, inputs):
        """Return ``relu(residual(inputs) + shortcut(inputs))``."""
        residual = self.branch2b(self.branch2a(inputs))
        if self.std_senet:
            residual = self.se(residual)

        identity = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=identity))
|
||||
|
||||
|
||||
class BottleNeck(nn.Layer):
    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 conv-norm layers.

    The block outputs ``ch_out * expansion`` channels.

    Args:
        ch_in (int): input channels.
        ch_out (int): base channel count of the block.
        stride (int): block stride; applied in the first 1x1 conv for
            variant 'a' and in the 3x3 conv otherwise.
        shortcut (bool): True to add the input unchanged; False to project it
            through a 1x1 conv-norm layer first.
        variant (str): see `stride`; with 'd' and ``stride == 2`` the
            projection is an average pool followed by a stride-1 1x1 conv.
        groups (int): groups of the 3x3 conv.
        base_width (int): with `groups`, sets the inner width
            ``int(ch_out * base_width / 64) * groups``.
        lr (float): learning-rate multiplier passed to every conv-norm layer.
        norm_type (str): passed to every conv-norm layer.
        norm_decay (float): passed to every conv-norm layer.
        freeze_norm (bool): passed to every conv-norm layer.
        dcn_v2 (bool): use a deformable conv in the 3x3 layer.
        std_senet (bool): append an ``SELayer`` to the residual branch.
    """

    expansion = 4

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=4,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()
        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)

        # ResNeXt
        width = int(ch_out * (base_width / 64.)) * groups
        expanded_ch = ch_out * self.expansion

        # Settings shared by every conv-norm layer of the block.
        shared_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=width,
            filter_size=1,
            stride=stride1,
            groups=1,
            act='relu',
            **shared_cfg)

        self.branch2b = ConvNormLayer(
            ch_in=width,
            ch_out=width,
            filter_size=3,
            stride=stride2,
            groups=groups,
            act='relu',
            dcn_v2=dcn_v2,
            **shared_cfg)

        self.branch2c = ConvNormLayer(
            ch_in=width,
            ch_out=expanded_ch,
            filter_size=1,
            stride=1,
            groups=1,
            **shared_cfg)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                # Downsample by pooling, then project with a stride-1 conv.
                projection = nn.Sequential()
                projection.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                projection.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=expanded_ch,
                        filter_size=1,
                        stride=1,
                        **shared_cfg))
                self.short = projection
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=expanded_ch,
                    filter_size=1,
                    stride=stride,
                    **shared_cfg)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(expanded_ch)

    def forward(self, inputs):
        """Return ``relu(residual(inputs) + shortcut(inputs))``."""
        residual = self.branch2c(self.branch2b(self.branch2a(inputs)))
        if self.std_senet:
            residual = self.se(residual)

        identity = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=identity))
|
||||
|
||||
|
||||
class Blocks(nn.Layer):
    """One stage: `count` residual blocks of type `block` applied in sequence.

    The first block never uses an identity shortcut and has stride 2 unless
    `stage_num` is 2; the remaining blocks use stride 1 and an identity
    shortcut. Sublayer names come from `name_adapter`.

    Args:
        block (type): block class; must expose an ``expansion`` attribute.
        ch_in (int): input channels of the first block.
        ch_out (int): base channel count handed to every block.
        count (int): number of blocks.
        name_adapter: object providing ``fix_layer_warp_name``.
        stage_num (int): stage index used for naming and the stride rule.

    The remaining arguments are forwarded unchanged to every block.
    """

    def __init__(self,
                 block,
                 ch_in,
                 ch_out,
                 count,
                 name_adapter,
                 stage_num,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()

        self.blocks = []
        for idx in range(count):
            is_first = idx == 0
            layer_name = name_adapter.fix_layer_warp_name(stage_num, count,
                                                          idx)
            unit = block(
                ch_in=ch_in,
                ch_out=ch_out,
                stride=2 if (is_first and stage_num != 2) else 1,
                shortcut=not is_first,
                variant=variant,
                groups=groups,
                base_width=base_width,
                lr=lr,
                norm_type=norm_type,
                norm_decay=norm_decay,
                freeze_norm=freeze_norm,
                dcn_v2=dcn_v2,
                std_senet=std_senet)
            self.blocks.append(self.add_sublayer(layer_name, unit))
            if is_first:
                # Later blocks consume the expanded output of the first one.
                ch_in = ch_out * block.expansion

    def forward(self, inputs):
        """Feed `inputs` through every block in order."""
        feat = inputs
        for unit in self.blocks:
            feat = unit(feat)
        return feat
|
||||
|
||||
|
||||
@register
@serializable
class ResNet(nn.Layer):
    __shared__ = ['norm_type']

    # NOTE: the list defaults below are kept as-is so the constructor
    # signature is unchanged; they are only read, never mutated, in this class.
    def __init__(self,
                 depth=50,
                 ch_in=64,
                 variant='b',
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
                 groups=1,
                 base_width=64,
                 norm_type='bn',
                 norm_decay=0,
                 freeze_norm=True,
                 freeze_at=0,
                 return_idx=[0, 1, 2, 3],
                 dcn_v2_stages=[-1],
                 num_stages=4,
                 std_senet=False,
                 freeze_stem_only=False):
        """
        Residual Network, see https://arxiv.org/abs/1512.03385

        Args:
            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
            ch_in (int): output channel of the stem (input of the first
                stage), default 64
            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
                lower learning rate ratio is need for pretrained model
                got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
                Must have exactly 4 entries.
            groups (int): group convolution cardinality
            base_width (int): base width of each group convolution
            norm_type (str): normalization type, 'bn' or 'sync_bn'
            norm_decay (float): weight decay for normalization layer weights
            freeze_norm (bool): freeze normalization layers
            freeze_at (int): if >= 0, the stem is frozen and, unless
                `freeze_stem_only` is set, so are the first
                min(freeze_at + 1, num_stages) stages; a negative value
                freezes nothing
            return_idx (int|list): index of the stages whose feature maps are returned
            dcn_v2_stages (int|list): index of stages who select deformable
                conv v2; the default [-1] matches no stage
            num_stages (int): total num of stages, between 1 and 4
            std_senet (bool): whether use senet, default False.
            freeze_stem_only (bool): when freezing, freeze only the stem and
                leave every stage trainable, default False.
        """
        super(ResNet, self).__init__()
        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
        assert num_stages >= 1 and num_stages <= 4
        self.depth = depth
        self.variant = variant
        self.groups = groups
        self.base_width = base_width
        self.norm_type = norm_type
        self.norm_decay = norm_decay
        self.freeze_norm = freeze_norm
        self.freeze_at = freeze_at
        if isinstance(return_idx, Integral):
            return_idx = [return_idx]
        assert max(return_idx) < num_stages, \
            'the maximum return index must smaller than num_stages, ' \
            'but received maximum return index is {} and num_stages ' \
            'is {}'.format(max(return_idx), num_stages)
        self.return_idx = return_idx
        self.num_stages = num_stages
        assert len(lr_mult_list) == 4, \
            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
        # Accept a single stage index as well as a list of indices.
        if isinstance(dcn_v2_stages, Integral):
            dcn_v2_stages = [dcn_v2_stages]
        assert max(dcn_v2_stages) < num_stages
        self.dcn_v2_stages = dcn_v2_stages

        block_nums = ResNet_cfg[depth]
        na = NameAdapter(self)

        # Stem: three 3x3 convs for variants 'c'/'d', one 7x7 conv otherwise.
        conv1_name = na.fix_c1_stage_name()
        if variant in ['c', 'd']:
            conv_def = [
                [3, ch_in // 2, 3, 2, "conv1_1"],
                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
            ]
        else:
            conv_def = [[3, ch_in, 7, 2, conv1_name]]
        self.conv1 = nn.Sequential()
        for (c_in, c_out, k, s, _name) in conv_def:
            self.conv1.add_sublayer(
                _name,
                ConvNormLayer(
                    ch_in=c_in,
                    ch_out=c_out,
                    filter_size=k,
                    stride=s,
                    groups=1,
                    act='relu',
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    lr=1.0))

        self.ch_in = ch_in
        ch_out_list = [64, 128, 256, 512]
        block = BottleNeck if depth >= 50 else BasicBlock

        self._out_channels = [block.expansion * v for v in ch_out_list]
        self._out_strides = [4, 8, 16, 32]

        self.res_layers = []
        for i in range(num_stages):
            lr_mult = lr_mult_list[i]
            stage_num = i + 2
            res_name = "res{}".format(stage_num)
            res_layer = self.add_sublayer(
                res_name,
                Blocks(
                    block,
                    self.ch_in,
                    ch_out_list[i],
                    count=block_nums[i],
                    name_adapter=na,
                    stage_num=stage_num,
                    variant=variant,
                    groups=groups,
                    base_width=base_width,
                    lr=lr_mult,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    dcn_v2=(i in self.dcn_v2_stages),
                    std_senet=std_senet))
            self.res_layers.append(res_layer)
            # The next stage consumes this stage's output channels.
            self.ch_in = self._out_channels[i]

        if freeze_at >= 0:
            self._freeze_parameters(self.conv1)
            if not freeze_stem_only:
                for i in range(min(freeze_at + 1, num_stages)):
                    self._freeze_parameters(self.res_layers[i])

    def _freeze_parameters(self, m):
        """Stop gradients for every parameter of layer `m`."""
        for p in m.parameters():
            p.stop_gradient = True

    @property
    def out_shape(self):
        """Channel and stride spec of each stage listed in `return_idx`."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: outputs of the stages whose index is in `return_idx`, in
            stage order.
        """
        x = inputs['image']
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
|
||||
|
||||
|
||||
@register
class Res5Head(nn.Layer):
    """Head made of a single stage-5 ``Blocks`` group with three blocks.

    Args:
        depth (int): depths of 50 or more use ``BottleNeck`` blocks with 1024
            input channels; smaller depths use ``BasicBlock`` with 256.
    """

    def __init__(self, depth=50):
        super().__init__()
        is_shallow = depth < 50
        feat_in = 256 if is_shallow else 1024
        feat_out = 512
        na = NameAdapter(self)
        block = BasicBlock if is_shallow else BottleNeck
        self.res5 = Blocks(
            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
        # Deeper variants report four times the base width as output channels.
        self.feat_out = feat_out if is_shallow else feat_out * 4

    @property
    def out_shape(self):
        """Single-entry list holding the output channel count and stride 16."""
        return [ShapeSpec(channels=self.feat_out, stride=16)]

    def forward(self, roi_feat, stage=0):
        """Apply the res5 blocks to `roi_feat`; `stage` is not used."""
        return self.res5(roi_feat)
|
||||
250
rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
Normal file
250
rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
Normal file
@@ -0,0 +1,250 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
from ppdet.modeling.ops import channel_shuffle
|
||||
|
||||
__all__ = ['ShuffleNetV2']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free ``Conv2D`` + ``BatchNorm2D`` + optional activation.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): convolution kernel size.
        stride (int): convolution stride.
        padding (int): convolution padding.
        groups (int): convolution groups.
        act (str|None): name of a function in ``paddle.nn.functional``;
            "hard_swish" is accepted as an alias of 'hardswish'. Falsy means
            no activation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 act=None):
        super().__init__()
        self._conv = Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # Norm weight and bias carry a zero L2-decay regularizer.
        self._batch_norm = BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        self.act = 'hardswish' if act == "hard_swish" else act

    def forward(self, inputs):
        """Apply conv, batch norm and the optional activation."""
        out = self._batch_norm(self._conv(inputs))
        if not self.act:
            return out
        return getattr(F, self.act)(out)
|
||||
|
||||
|
||||
class InvertedResidual(nn.Layer):
    """ShuffleNetV2 unit that transforms one half of the channels.

    The input is split in two along axis 1. The first half passes through
    untouched; the second goes through 1x1 conv -> 3x3 depthwise conv ->
    1x1 conv. The halves are concatenated and channel-shuffled.

    Args:
        in_channels (int): channels of the unit input.
        out_channels (int): channels of the unit output.
        stride (int): stride of the depthwise conv.
        act (str): activation of the two 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super().__init__()
        branch_ch = out_channels // 2
        self._conv_pw = ConvBNLayer(
            in_channels=in_channels // 2,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        # Depthwise: one group per channel, no activation.
        self._conv_dw = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=branch_ch,
            act=None)
        self._conv_linear = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        """Return the channel-shuffled concatenation of both halves."""
        half = inputs.shape[1] // 2
        passthrough, branch = paddle.split(
            inputs, num_or_sections=[half, half], axis=1)
        branch = self._conv_linear(self._conv_dw(self._conv_pw(branch)))
        merged = paddle.concat([passthrough, branch], axis=1)
        return channel_shuffle(merged, 2)
|
||||
|
||||
|
||||
class InvertedResidualDS(nn.Layer):
    """ShuffleNetV2 unit in which both branches transform the full input.

    Branch 1: 3x3 depthwise conv -> 1x1 conv.
    Branch 2: 1x1 conv -> 3x3 depthwise conv -> 1x1 conv.
    Each branch yields ``out_channels // 2`` channels; the results are
    concatenated and channel-shuffled.

    Args:
        in_channels (int): channels of the unit input.
        out_channels (int): channels of the unit output.
        stride (int): stride of both depthwise convs.
        act (str): activation of the 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super().__init__()
        branch_ch = out_channels // 2

        # branch1
        self._conv_dw_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            act=None)
        self._conv_linear_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

        # branch2
        self._conv_pw_2 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        self._conv_dw_2 = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=branch_ch,
            act=None)
        self._conv_linear_2 = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        """Return the channel-shuffled concatenation of both branches."""
        first = self._conv_linear_1(self._conv_dw_1(inputs))
        second = self._conv_linear_2(
            self._conv_dw_2(self._conv_pw_2(inputs)))
        merged = paddle.concat([first, second], axis=1)
        return channel_shuffle(merged, 2)
|
||||
|
||||
|
||||
@register
@serializable
class ShuffleNetV2(nn.Layer):
    """ShuffleNetV2 backbone.

    Args:
        scale (float): width multiplier; one of 0.25, 0.33, 0.5, 1.0, 1.5,
            2.0. Any other value raises ``NotImplementedError``.
        act (str): activation used by the conv layers.
        feature_maps (int|list): indices of the layers whose outputs are
            returned. The stem counts as index 1, so the first inverted
            residual unit has index 2.
    """

    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
        super().__init__()
        self.scale = scale
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        stage_repeats = [4, 8, 4]

        # Channel plan per supported scale; index 0 is a placeholder.
        channel_plans = (
            (0.25, [-1, 24, 24, 48, 96, 512]),
            (0.33, [-1, 24, 32, 64, 128, 512]),
            (0.5, [-1, 24, 48, 96, 192, 1024]),
            (1.0, [-1, 24, 116, 232, 464, 1024]),
            (1.5, [-1, 24, 176, 352, 704, 1024]),
            (2.0, [-1, 24, 244, 488, 976, 2048]),
        )
        for known_scale, plan in channel_plans:
            if scale == known_scale:
                stage_out_channels = plan
                break
        else:
            raise NotImplementedError("This scale size:[" + str(scale) +
                                      "] is not implemented!")
        self._out_channels = []
        self._feature_idx = 0

        # 1. conv1
        self._conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=stage_out_channels[1],
            kernel_size=3,
            stride=2,
            padding=1,
            act=act)
        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
        self._feature_idx += 1

        # 2. bottleneck sequences
        self._block_list = []
        for stage_id, num_repeat in enumerate(stage_repeats):
            stage_ch = stage_out_channels[stage_id + 2]
            for i in range(num_repeat):
                if i == 0:
                    # A stage opens with the stride-2 two-branch unit.
                    unit = InvertedResidualDS(
                        in_channels=stage_out_channels[stage_id + 1],
                        out_channels=stage_ch,
                        stride=2,
                        act=act)
                else:
                    unit = InvertedResidual(
                        in_channels=stage_ch,
                        out_channels=stage_ch,
                        stride=1,
                        act=act)
                block = self.add_sublayer(
                    name=str(stage_id + 2) + '_' + str(i + 1), sublayer=unit)
                self._block_list.append(block)
                self._feature_idx += 1
                self._update_out_channels(stage_ch, self._feature_idx,
                                          self.feature_maps)

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record `channel` when `feature_idx` is a requested feature map."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``; return selected outputs."""
        feat = self._max_pool(self._conv1(inputs['image']))
        outs = []
        for i, unit in enumerate(self._block_list):
            feat = unit(feat)
            # Unit i has feature index i + 2 (the stem is index 1).
            if i + 2 in self.feature_maps:
                outs.append(feat)
        return outs

    @property
    def out_shape(self):
        """Channel spec of each returned feature map."""
        return [ShapeSpec(channels=ch) for ch in self._out_channels]
|
||||
752
rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
Normal file
752
rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
Normal file
@@ -0,0 +1,752 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
|
||||
The copyright of microsoft/Swin-Transformer is as follows:
|
||||
MIT License [see LICENSE for details]
|
||||
"""
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from .transformer_utils import DropPath, Identity
|
||||
from .transformer_utils import add_parameter, to_2tuple
|
||||
from .transformer_utils import ones_, zeros_, trunc_normal_
|
||||
|
||||
__all__ = ['SwinTransformer']
|
||||
|
||||
# Preset Swin Transformer configurations keyed by model name.
# The 22k-to-1k finetuned weights are the default pretrained checkpoints;
# per the original note they can be overridden through
# SwinTransformer.pretrained in the config.
MODEL_cfg = {
    'swin_T_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_S_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_224': {
        'pretrain_img_size': 224,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_L_224': {
        'pretrain_img_size': 224,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_384': {
        'pretrain_img_size': 384,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
    'swin_L_384': {
        'pretrain_img_size': 384,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
}
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Two-layer perceptron: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): input width.
        hidden_features (int|None): hidden width; falls back to
            `in_features` when falsy.
        out_features (int|None): output width; falls back to `in_features`
            when falsy.
        act_layer (callable): factory of the activation layer.
        drop (float): dropout probability, shared by both dropout steps.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to `x`."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
||||
|
||||
|
||||
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    _, height, width, channels = x.shape
    # Split both spatial axes into (number of windows, window_size).
    tiled = x.reshape([
        -1, height // window_size, window_size, width // window_size,
        window_size, channels
    ])
    # Put the two window-index axes next to each other, then merge them
    # with the batch axis.
    tiled = tiled.transpose([0, 1, 3, 2, 4, 5])
    return tiled.reshape([-1, window_size, window_size, channels])
|
||||
|
||||
|
||||
def window_reverse(windows, window_size, H, W):
    """Reassemble windows produced by ``window_partition`` into feature maps.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    # The batch size is left to ``reshape`` (-1). The previous explicit
    # ``B = int(windows.shape[0] / ...)`` was never used and could raise when
    # the leading dimension is symbolic or when H * W is zero.
    _, _, _, C = windows.shape
    x = windows.reshape(
        [-1, H // window_size, W // window_size, window_size, window_size, C])
    # Interleave (grid row, in-window row) and (grid col, in-window col).
    x = x.transpose([0, 1, 3, 2, 4, 5])
    return x.reshape([-1, H, W, C])
|
||||
|
||||
|
||||
class WindowAttention(nn.Layer):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # Learnable bias table: one row per possible relative offset
        # ((2*Wh-1) * (2*Ww-1) rows), one column per head.
        table_rows = (2 * window_size[0] - 1) * (2 * window_size[1] - 1)
        self.relative_position_bias_table = add_parameter(
            self, paddle.zeros((table_rows, num_heads)))

        # Pair-wise relative position index for every token pair in a window.
        coords_h = paddle.arange(self.window_size[0])
        coords_w = paddle.arange(self.window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (coords_flatten.unsqueeze(axis=2) -
                           coords_flatten.unsqueeze(axis=1))
        relative_coords = relative_coords.transpose(
            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
        # Shift both offsets so they start from 0, then scale the row offset
        # so that (row, col) pairs map to unique flat indices after the sum.
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table)
        self.softmax = nn.Softmax(axis=-1)

    def forward(self, x, mask=None):
        """ Forward function.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        _, N, C = x.shape
        qkv = self.qkv(x).reshape(
            [-1, N, 3, self.num_heads, C // self.num_heads])
        qkv = qkv.transpose([2, 0, 3, 1, 4])
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))

        # Gather the per-head bias for every token pair of the window.
        tokens_per_window = self.window_size[0] * self.window_size[1]
        index = self.relative_position_index.flatten()
        bias = paddle.index_select(self.relative_position_bias_table, index)
        bias = bias.reshape(
            [tokens_per_window, tokens_per_window, -1])  # Wh*Ww,Wh*Ww,nH
        bias = bias.transpose([2, 0, 1])  # nH, Wh*Ww, Wh*Ww
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # Broadcast the per-window mask over the batch and head axes.
            nW = mask.shape[0]
            attn = attn.reshape([-1, nW, self.num_heads, N, N])
            attn = attn + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.reshape([-1, self.num_heads, N, N])
        attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
        return self.proj_drop(self.proj(x))
|
||||
|
||||
|
||||
class SwinTransformerBlock(nn.Layer):
    """ Swin Transformer Block.

    The spatial size of the incoming token sequence is not passed to
    ``forward``; the caller must set the ``H`` and ``W`` attributes first.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        # Spatial resolution of the token grid; assigned by the owning layer
        # before every forward pass.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are read
                from ``self.H`` / ``self.W``.
            mask_matrix: Attention mask for cyclic shift.
        """
        _, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        ws = self.window_size
        shift = self.shift_size

        shortcut = x
        x = self.norm1(x)
        x = x.reshape([-1, H, W, C])

        # pad feature maps (bottom / right only) to multiples of window size
        pad_l = pad_t = 0
        pad_r = (ws - W % ws) % ws
        pad_b = (ws - H % ws) % ws
        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
                  data_format='NHWC')
        _, Hp, Wp, _ = x.shape

        # cyclic shift; the mask is only needed for the shifted variant
        if shift > 0:
            shifted_x = paddle.roll(x, shifts=(-shift, -shift), axis=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(shifted_x, ws)  # nW*B, ws, ws, C
        x_windows = x_windows.reshape(
            [x_windows.shape[0], ws * ws, C])  # nW*B, ws*ws, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(x_windows, mask=attn_mask)  # nW*B, ws*ws, C

        # merge windows
        attn_windows = attn_windows.reshape([x_windows.shape[0], ws, ws, C])
        shifted_x = window_reverse(attn_windows, ws, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if shift > 0:
            x = paddle.roll(shifted_x, shifts=(shift, shift), axis=(1, 2))
        else:
            x = shifted_x

        # drop the rows / columns that were added by padding
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :]

        x = x.reshape([-1, H * W, C])

        # residual around attention, then residual around the FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x
|
||||
|
||||
|
||||
class PatchMerging(nn.Layer):
    r""" Patch Merging Layer.

    Concatenates each 2x2 neighbourhood of tokens along the channel axis
    (C -> 4*C), normalizes, and projects linearly to 2*C channels.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        _, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        x = x.reshape([-1, H, W, C])

        # Pad odd heights / widths by one so the 2x2 grouping is exact.
        if (H % 2 == 1) or (W % 2 == 1):
            # paddle F.pad default data_format is 'NCHW', so state NHWC explicitly
            x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
            H += H % 2
            W += W % 2

        # The four members of every 2x2 neighbourhood, each B H/2 W/2 C.
        top_left = x[:, 0::2, 0::2, :]
        bottom_left = x[:, 1::2, 0::2, :]
        top_right = x[:, 0::2, 1::2, :]
        bottom_right = x[:, 1::2, 1::2, :]
        x = paddle.concat(
            [top_left, bottom_left, top_right, bottom_right],
            -1)  # B H/2 W/2 4*C
        x = x.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C

        return self.reduction(self.norm(x))
|
||||
|
||||
|
||||
class BasicLayer(nn.Layer):
    """ A basic Swin Transformer layer for one stage.

    Blocks alternate between non-shifted (even index) and shifted (odd index,
    shift of ``window_size // 2``) window attention.

    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | np.ndarray, optional): Stochastic depth rate; an
            ndarray is indexed per block, any other value is shared by all
            blocks. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth

        # build blocks
        blocks = []
        for i in range(depth):
            if isinstance(drop_path, np.ndarray):
                block_drop_path = drop_path[i]
            else:
                block_drop_path = drop_path
            blocks.append(
                SwinTransformerBlock(
                    dim=dim,
                    num_heads=num_heads,
                    window_size=window_size,
                    shift_size=0 if (i % 2 == 0) else window_size // 2,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop,
                    attn_drop=attn_drop,
                    drop_path=block_drop_path,
                    norm_layer=norm_layer))
        self.blocks = nn.LayerList(blocks)

        # patch merging layer
        if downsample is None:
            self.downsample = None
        else:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            tuple: ``(x, H, W, x_next, H_next, W_next)`` where the first three
            items are this stage's output and its resolution, and the last
            three are the (optionally downsampled) input for the next stage.
        """
        ws = self.window_size
        shift = self.shift_size

        # calculate attention mask for SW-MSA on the padded resolution
        Hp = int(np.ceil(H / ws)) * ws
        Wp = int(np.ceil(W / ws)) * ws
        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1

        # The same three bands are used along both axes; every (row band,
        # column band) combination gets its own region id.
        bands = (slice(0, -ws), slice(-ws, -shift), slice(-shift, None))
        region_id = 0
        for h_band in bands:
            for w_band in bands:
                img_mask[:, h_band, w_band, :] = region_id
                region_id += 1

        mask_windows = window_partition(img_mask, ws)  # nW, ws, ws, 1
        mask_windows = mask_windows.reshape([-1, ws * ws])
        # Token pairs from different regions get -100, same-region pairs get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        huns = -100.0 * paddle.ones_like(attn_mask)
        attn_mask = huns * (attn_mask != 0).astype("float32")

        for blk in self.blocks:
            # Blocks read the spatial size from attributes, not arguments.
            blk.H, blk.W = H, W
            x = blk(x, attn_mask)

        if self.downsample is None:
            return x, H, W, x, H, W

        x_down = self.downsample(x, H, W)
        Wh, Ww = (H + 1) // 2, (W + 1) // 2
        return x, H, W, x_down, Wh, Ww
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        # Non-overlapping patches: kernel size equals stride.
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size)
        self.norm = None if norm_layer is None else norm_layer(embed_dim)

    def forward(self, x):
        """Embed an image batch and return the result in channel-first layout.

        The input is padded on the right / bottom so that its width / height
        become multiples of the patch size before the projection is applied.
        """
        # TODO # export dynamic shape
        _, _, H, W = x.shape
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        if W % patch_w != 0:
            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
        if H % patch_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])

        x = self.proj(x)
        if self.norm is not None:
            # Normalize over channels in token layout, then restore the
            # channel-first map.
            _, _, Wh, Ww = x.shape
            tokens = x.flatten(2).transpose([0, 2, 1])
            tokens = self.norm(tokens)
            x = tokens.transpose([0, 2, 1]).reshape(
                [-1, self.embed_dim, Wh, Ww])

        return x
|
||||
|
||||
|
||||
@register
@serializable
class SwinTransformer(nn.Layer):
    """ Swin Transformer backbone

    Note that ``pretrain_img_size``, ``embed_dim``, ``depths``, ``num_heads``
    and ``window_size`` are always taken from ``MODEL_cfg[arch]``; the values
    passed for those arguments are ignored.

    Args:
        arch (str): Architecture of the Swin Transformer; must be a key of
            ``MODEL_cfg``. Default: 'swin_T_224'
        pretrain_img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        out_indices (tuple(int)): Indices of the stages whose (normalized)
            outputs are returned by ``forward``. Default: (0, 1, 2, 3)
        frozen_stages (int): Controls which parts are frozen, see
            ``_freeze_stages``; negative values freeze nothing. Default: -1
        pretrained (str | None): URL or local path of weights to load. When
            None, the entry from ``MODEL_cfg[arch]`` is used. Default: None
    """

    def __init__(self,
                 arch='swin_T_224',
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 pretrained=None):
        super(SwinTransformer, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The architecture table overrides the size-related arguments.
        arch_cfg = MODEL_cfg[arch]
        pretrain_img_size = arch_cfg['pretrain_img_size']
        embed_dim = arch_cfg['embed_dim']
        depths = arch_cfg['depths']
        num_heads = arch_cfg['num_heads']
        window_size = arch_cfg['window_size']
        if pretrained is None:
            pretrained = arch_cfg['pretrained']

        self.num_layers = len(depths)
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = add_parameter(
                self,
                paddle.zeros((1, embed_dim, patches_resolution[0],
                              patches_resolution[1])))
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule: rate grows linearly over all blocks
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # build layers
        self.layers = nn.LayerList()
        last_stage = self.num_layers - 1
        for i_layer in range(self.num_layers):
            first_block = sum(depths[:i_layer])
            end_block = sum(depths[:i_layer + 1])
            self.layers.append(
                BasicLayer(
                    dim=int(embed_dim * 2**i_layer),
                    depth=depths[i_layer],
                    num_heads=num_heads[i_layer],
                    window_size=window_size,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    qk_scale=qk_scale,
                    drop=drop_rate,
                    attn_drop=attn_drop_rate,
                    drop_path=dpr[first_block:end_block],
                    norm_layer=norm_layer,
                    # every stage but the last one ends with patch merging
                    downsample=PatchMerging if (i_layer < last_stage) else None))

        # channel count doubles at every stage
        self.num_features = [
            int(embed_dim * 2**i) for i in range(self.num_layers)
        ]

        # add a norm layer for each output
        for i_layer in out_indices:
            self.add_sublayer(f'norm{i_layer}',
                              norm_layer(self.num_features[i_layer]))

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            if 'http' in pretrained:  # URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Freeze parts of the network according to ``self.frozen_stages``.

        * ``>= 0``: patch embedding is put in eval mode and its parameters
          stop receiving gradients.
        * ``>= 1``: additionally the absolute position embedding (if ``ape``).
        * ``>= 2``: additionally ``pos_drop`` is put in eval mode and layers
          ``0 .. frozen_stages - 2`` are frozen.
        """
        frozen = self.frozen_stages
        if frozen >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if frozen >= 1 and self.ape:
            self.absolute_pos_embed.stop_gradient = True

        if frozen >= 2:
            self.pos_drop.eval()
            for i in range(0, frozen - 1):
                stage = self.layers[i]
                stage.eval()
                for param in stage.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initialize Linear (trunc-normal weight, zero bias) and LayerNorm
        (unit weight, zero bias) sublayers; other layers are left untouched."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Forward function.

        Args:
            x (dict): Input dict; the image batch is read from ``x['image']``.

        Returns:
            list: One channel-first feature map per index in ``out_indices``.
        """
        x = self.patch_embed(x['image'])
        _, _, Wh, Ww = x.shape
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = x + absolute_pos_embed
        # channel-first map -> token sequence (B, Wh*Ww, C)
        x = x.flatten(2).transpose([0, 2, 1])
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            x_out, H, W, x, Wh, Ww = self.layers[i](x, Wh, Ww)
            if i not in self.out_indices:
                continue
            x_out = getattr(self, f'norm{i}')(x_out)
            # token sequence -> channel-first feature map
            out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
                (0, 3, 1, 2))
            outs.append(out)

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) of every returned feature map."""
        out_strides = [4, 8, 16, 32]
        return [
            ShapeSpec(
                channels=self.num_features[i], stride=out_strides[i])
            for i in self.out_indices
        ]
|
||||
381
rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
Normal file
381
rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
Normal file
@@ -0,0 +1,381 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import ReLU, Swish, GELU
|
||||
import math
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['TransEncoder']
|
||||
|
||||
|
||||
class BertEmbeddings(nn.Layer):
    """Sum of word, position and token-type embeddings with LayerNorm and dropout.

    Args:
        word_size (int): Number of rows of the word embedding table.
        position_embeddings_size (int): Number of rows of the position
            embedding table.
        word_type_size (int): Number of rows of the token-type embedding table.
        hidden_size (int): Embedding width shared by all three tables.
        dropout_prob (float): Dropout probability applied to the output.
    """

    def __init__(self, word_size, position_embeddings_size, word_type_size,
                 hidden_size, dropout_prob):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(
            word_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, token_type_ids=None, position_ids=None):
        """Embed the index tensor ``x``.

        Args:
            x: Index tensor looked up in the word embedding table; its second
                dimension is used as the sequence length.
            token_type_ids: Optional token-type indices; all zeros (same shape
                as ``x``) when omitted.
            position_ids: Optional position indices; ``0 .. seq_len - 1``
                expanded to the shape of ``x`` when omitted.

        Returns:
            The normalized, dropout-regularized sum of the three embeddings.
        """
        seq_len = paddle.shape(x)[1]
        if position_ids is None:
            position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
        if token_type_ids is None:
            # Embedding lookups need integer indices; paddle.zeros defaults
            # to float32, so the dtype has to be given explicitly.
            token_type_ids = paddle.zeros(paddle.shape(x), dtype='int64')

        word_embs = self.word_embeddings(x)
        position_embs = self.position_embeddings(position_ids)
        token_type_embs = self.token_type_embeddings(token_type_ids)

        embs_cmb = word_embs + position_embs + token_type_embs
        embs_out = self.layernorm(embs_cmb)
        embs_out = self.dropout(embs_out)
        return embs_out
|
||||
|
||||
|
||||
class BertSelfAttention(nn.Layer):
    """Multi-head scaled dot-product self attention.

    Args:
        hidden_size (int): Width of the input features; must be divisible by
            ``num_attention_heads``.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout probability applied to
            the attention probabilities.
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities. Default: False

    Raises:
        ValueError: If ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 output_attentions=False):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            # The message uses ``{}`` placeholders, so it must be filled with
            # ``str.format``; the ``%`` operator raised a TypeError here and
            # hid the intended ValueError.
            raise ValueError(
                "The hidden_size must be a multiple of the number of attention "
                "heads, but got {} % {} != 0".format(hidden_size,
                                                     num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)
        self.output_attentions = output_attentions

    def forward(self, x, attention_mask, head_mask=None):
        """Run self attention over ``x``.

        Args:
            x: Input features; the first two dimensions are kept and the last
                one is split across heads.
            attention_mask: Additive mask, added to the scaled attention
                scores before the softmax.
            head_mask: Optional multiplicative mask applied to the attention
                probabilities after dropout.

        Returns:
            tuple: ``(context,)`` or, when ``output_attentions`` is set,
            ``(context, attention_probabilities)``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # Split the feature axis into (heads, head_size).
        query_dim1, query_dim2 = paddle.shape(query)[:-1]
        new_shape = [
            query_dim1, query_dim2, self.num_attention_heads,
            self.attention_head_size
        ]
        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
        # The key is transposed directly into (.., head_size, seq) so it can
        # be right-multiplied without another transpose.
        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))

        attention = paddle.matmul(query,
                                  key) / math.sqrt(self.attention_head_size)
        attention = attention + attention_mask
        attention_value = F.softmax(attention, axis=-1)
        attention_value = self.dropout(attention_value)

        if head_mask is not None:
            attention_value = attention_value * head_mask

        # Merge the heads back into a single feature axis.
        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
                                                                        3))
        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
        new_context_shape = [
            ctx_dim1,
            ctx_dim2,
            self.all_head_size,
        ]
        context = context.reshape(new_context_shape)

        if self.output_attentions:
            return (context, attention_value)
        else:
            return (context, )
|
||||
|
||||
|
||||
class BertAttention(nn.Layer):
    """Self attention followed by a linear projection, dropout and a
    residual LayerNorm.

    Args:
        hidden_size (int): Width of the input and output features.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout probability inside the
            self-attention module.
        fc_dropout_prob (float): Dropout probability after the projection.
        output_attentions (bool): Forwarded to the self-attention module.
            Default: False
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 output_attentions=False):
        super(BertAttention, self).__init__()
        self.bert_selfattention = BertSelfAttention(
            hidden_size, num_attention_heads, attention_probs_dropout_prob,
            output_attentions)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)``, or ``(features, attention)`` when the
        self-attention module also yields its attention probabilities."""
        attn_out = self.bert_selfattention(x, attention_mask, head_mask)
        projected = self.dropout(self.fc(attn_out[0]))
        # residual connection around the attention sub-block
        features = self.layernorm(projected + x)
        if len(attn_out) == 2:
            return (features, attn_out[1])
        return (features, )
|
||||
|
||||
|
||||
class BertFeedForward(nn.Layer):
    """Position-wise feed-forward sub-block with a residual LayerNorm.

    Args:
        hidden_size (int): Width of the input and output features.
        intermediate_size (int): Width of the hidden layer.
        num_attention_heads (int): Unused here; kept for signature
            compatibility.
        attention_probs_dropout_prob (float): Unused here; kept for signature
            compatibility.
        fc_dropout_prob (float): Dropout probability after the second linear
            layer.
        act_fn (str): Python expression naming the activation, evaluated in
            this module's namespace (e.g. ``'ReLU'``). It may evaluate to a
            layer class, a layer instance or a plain function. Default: 'ReLU'
        output_attentions (bool): Unused here; kept for signature
            compatibility. Default: False
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertFeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        # NOTE(security): ``act_fn`` is passed to eval(); it must come from
        # trusted configuration only, never from untrusted input.
        activation = eval(act_fn)
        # A name such as 'ReLU' evaluates to the layer *class*. Calling the
        # class on a tensor in ``forward`` would build a new layer instead of
        # applying the activation, so classes are instantiated once here.
        # Instances and plain functions are used as they are.
        if isinstance(activation, type):
            activation = activation()
        self.act_fn = activation
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x):
        """Apply ``fc1 -> activation -> fc2 -> dropout`` and return the
        LayerNorm of the result added to ``x``."""
        features = self.fc1(x)
        features = self.act_fn(features)
        features = self.fc2(features)
        features = self.dropout(features)
        features = self.layernorm(features + x)
        return features
|
||||
|
||||
|
||||
class BertLayer(nn.Layer):
    """One transformer encoder layer: attention sub-block followed by the
    feed-forward sub-block.

    Args:
        hidden_size (int): Width of the input and output features.
        intermediate_size (int): Hidden width of the feed-forward sub-block.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout probability on the
            attention probabilities.
        fc_dropout_prob (float): Dropout probability after the linear
            projections of both sub-blocks.
        act_fn (str): Activation expression forwarded to the feed-forward
            sub-block. Default: 'ReLU'
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities. Default: False
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertLayer, self).__init__()
        # BertAttention takes ``fc_dropout_prob`` as its fourth positional
        # argument. It used to receive ``output_attentions`` in that slot,
        # which set the dropout rate from a bool and left attention output
        # permanently disabled.
        self.attention = BertAttention(hidden_size, num_attention_heads,
                                       attention_probs_dropout_prob,
                                       fc_dropout_prob, output_attentions)
        self.feed_forward = BertFeedForward(
            hidden_size, intermediate_size, num_attention_heads,
            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
            output_attentions)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)``, or ``(features, attention)`` when the
        attention sub-block also yields its attention probabilities."""
        attention_feats = self.attention(x, attention_mask, head_mask)
        features = self.feed_forward(attention_feats[0])
        if len(attention_feats) == 2:
            return (features, attention_feats[1])
        else:
            return (features, )
|
||||
|
||||
|
||||
class BertEncoder(nn.Layer):
    """Stack of ``BertLayer`` modules applied sequentially.

    Args:
        num_hidden_layers (int): Number of stacked layers.
        hidden_size (int): Width of the input and output features.
        intermediate_size (int): Hidden width of each feed-forward sub-block.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout probability on the
            attention probabilities.
        fc_dropout_prob (float): Dropout probability after linear projections.
        act_fn (str): Activation expression forwarded to every layer.
            Default: 'ReLU'
        output_attentions (bool): If True, the per-layer attention
            probabilities are appended to the output. Default: False
        output_hidden_feats (bool): If True, the input and every layer's
            output are appended to the output. Default: False
    """

    def __init__(self,
                 num_hidden_layers,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(BertEncoder, self).__init__()
        self.output_attentions = output_attentions
        self.output_hidden_feats = output_hidden_feats
        self.layers = nn.LayerList([
            BertLayer(hidden_size, intermediate_size, num_attention_heads,
                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
                      output_attentions) for _ in range(num_hidden_layers)
        ])

    def forward(self, x, attention_mask, head_mask=None):
        """Run all layers.

        Args:
            x: Input features of the first layer.
            attention_mask: Additive attention mask passed to every layer.
            head_mask: Optional sequence indexed by layer; entry ``i`` is
                passed to layer ``i``.

        Returns:
            tuple: ``(last_features,)`` optionally followed by the tuple of
            hidden features ``(input, layer_1_out, ..., layer_n_out)`` and by
            the tuple of per-layer attention probabilities.
        """
        # Starts with the encoder input; each layer then contributes its
        # output, so the tuple holds ``num_layers + 1`` distinct entries.
        all_features = (x, )
        all_attentions = ()

        for i, layer in enumerate(self.layers):
            mask = head_mask[i] if head_mask is not None else None
            layer_out = layer(x, attention_mask, mask)

            x = layer_out[0]
            if self.output_hidden_feats:
                # Record the layer *output*. Recording the layer input here
                # duplicated the encoder input and dropped the final output.
                all_features = all_features + (x, )
            if self.output_attentions:
                all_attentions = all_attentions + (layer_out[1], )

        outputs = (x, )
        if self.output_hidden_feats:
            outputs += (all_features, )
        if self.output_attentions:
            outputs += (all_attentions, )
        return outputs
|
||||
|
||||
|
||||
class BertPooler(nn.Layer):
    """Pool a sequence by projecting its first token through Linear + Tanh.

    Args:
        hidden_size (int): Width of the input and output features.
    """

    def __init__(self, hidden_size):
        super(BertPooler, self).__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.act = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(fc(x[:, 0]))``."""
        return self.act(self.fc(x[:, 0]))
|
||||
|
||||
|
||||
class METROEncoder(nn.Layer):
    """Transformer encoder mapping ``features_dims``-wide tokens to
    ``output_feature_dim``-wide predictions, with a linear residual path.

    Args:
        vocab_size (int): forwarded to ``BertEmbeddings``.
        num_hidden_layers (int): number of encoder layers.
        features_dims (int): width of the input features.
        position_embeddings_size (int): number of learnable positions.
        hidden_size (int): width used inside the encoder.
        intermediate_size (int): forwarded to ``BertEncoder``.
        output_feature_dim (int): width of the prediction.
        num_attention_heads (int): forwarded to ``BertEncoder``.
        attention_probs_dropout_prob (float): forwarded to ``BertEncoder``.
        fc_dropout_prob (float): dropout applied to the summed embeddings and
            forwarded to ``BertEmbeddings`` / ``BertEncoder``.
        act_fn (str): forwarded to ``BertEncoder``.
        output_attentions (bool): forwarded to ``BertEncoder``.
        output_hidden_feats (bool): forwarded to ``BertEncoder``.
        use_img_layernorm (bool): apply LayerNorm to the summed embeddings
            before dropout.
    """

    def __init__(self,
                 vocab_size,
                 num_hidden_layers,
                 features_dims,
                 position_embeddings_size,
                 hidden_size,
                 intermediate_size,
                 output_feature_dim,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False,
                 use_img_layernorm=False):
        super(METROEncoder, self).__init__()
        self.img_dims = features_dims
        self.num_hidden_layers = num_hidden_layers
        self.use_img_layernorm = use_img_layernorm
        self.output_attentions = output_attentions
        # forward() branches on this flag; it was previously never stored,
        # which raised AttributeError whenever output_attentions was true.
        self.output_hidden_feats = output_hidden_feats
        # NOTE: `embedding` and `pooler` are built but not used in forward().
        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
                                        hidden_size, fc_dropout_prob)
        self.encoder = BertEncoder(
            num_hidden_layers, hidden_size, intermediate_size,
            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
            act_fn, output_attentions, output_hidden_feats)
        self.pooler = BertPooler(hidden_size)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.img_embedding = nn.Linear(
            features_dims, hidden_size, bias_attr=True)
        self.dropout = nn.Dropout(fc_dropout_prob)
        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
        self.residual = nn.Linear(features_dims, output_feature_dim)
        if self.use_img_layernorm:
            # forward() calls self.layernorm when use_img_layernorm is set,
            # but the layer was never created. Created last so the order of
            # the pre-existing sub-layers is unchanged.
            # NOTE(review): epsilon follows the usual BERT value 1e-12;
            # confirm against the reference implementation.
            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-12)

        self.apply(self.init_weights)

    def init_weights(self, module):
        """ Initialize the weights.

        Linear/Embedding weights are drawn from N(0, 0.02); LayerNorm is set
        to weight 1 and bias 0; Linear biases are zeroed.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.set_value(
                paddle.normal(
                    mean=0.0, std=0.02, shape=module.weight.shape))
        elif isinstance(module, nn.LayerNorm):
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
            module.weight.set_value(
                paddle.full(
                    shape=module.weight.shape, fill_value=1.0))
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))

    def forward(self, x):
        """Encode ``x`` and return the prediction.

        Args:
            x: features whose first two axes are (batch, sequence) and whose
                last axis is fed to ``img_embedding`` / ``residual``.

        Returns:
            ``pred_score`` alone, or ``(pred_score, hidden_feats, attentions)``
            when both ``output_attentions`` and ``output_hidden_feats`` are
            set.
        """
        batchsize, seq_len = paddle.shape(x)[:2]
        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
        position_ids = paddle.arange(
            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)

        # All-ones mask: every token is visible, so the additive mask built
        # below is all zeros.
        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
        head_mask = [None] * self.num_hidden_layers

        position_embs = self.position_embeddings(position_ids)
        attention_mask = (1.0 - attention_mask) * -10000.0

        img_features = self.img_embedding(x)

        # We empirically observe that adding an additional learnable position embedding leads to more stable training
        embeddings = position_embs + img_features
        if self.use_img_layernorm:
            embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)

        encoder_outputs = self.encoder(
            embeddings, attention_mask, head_mask=head_mask)

        pred_score = self.cls_head(encoder_outputs[0])
        # Linear shortcut from the raw input to the prediction.
        res_img_feats = self.residual(x)
        pred_score = pred_score + res_img_feats

        if self.output_attentions and self.output_hidden_feats:
            return pred_score, encoder_outputs[1], encoder_outputs[-1]
        else:
            return pred_score
|
||||
|
||||
|
||||
def gelu(x):
    """Erf-based GELU activation: ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

    Reference: https://arxiv.org/abs/1606.08415
    """
    half_x = x * 0.5
    erf_term = paddle.erf(x / math.sqrt(2.0))
    return half_x * (1.0 + erf_term)
|
||||
|
||||
|
||||
@register
class TransEncoder(nn.Layer):
    """Chain of ``METROEncoder`` stages that progressively reduce feature width.

    Stage ``i`` maps ``input_feat_dim[i]`` features to ``input_feat_dim[i + 1]``
    features; the last stage outputs 3 features. ``hidden_feat_dim[i]`` is the
    hidden width of stage ``i`` and must be divisible by
    ``num_attention_heads``. The remaining arguments are forwarded to every
    ``METROEncoder``.
    """

    def __init__(self,
                 vocab_size=30522,
                 num_hidden_layers=4,
                 num_attention_heads=4,
                 position_embeddings_size=512,
                 intermediate_size=3072,
                 input_feat_dim=[2048, 512, 128],
                 hidden_feat_dim=[1024, 256, 128],
                 attention_probs_dropout_prob=0.1,
                 fc_dropout_prob=0.1,
                 act_fn='gelu',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(TransEncoder, self).__init__()
        # Each stage outputs what the next one consumes; the final width is 3.
        output_feat_dim = input_feat_dim[1:] + [3]

        stages = []
        for stage_idx, stage_out_dim in enumerate(output_feat_dim):
            stage_in_dim = input_feat_dim[stage_idx]
            stage_hidden = hidden_feat_dim[stage_idx]

            # The hidden width must split evenly across attention heads.
            assert stage_hidden % num_attention_heads == 0
            stages.append(
                METROEncoder(vocab_size, num_hidden_layers, stage_in_dim,
                             position_embeddings_size, stage_hidden,
                             intermediate_size, stage_out_dim,
                             num_attention_heads,
                             attention_probs_dropout_prob, fc_dropout_prob,
                             act_fn, output_attentions, output_hidden_feats))
        self.trans_encoder = paddle.nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through all stages in order and return the result."""
        return self.trans_encoder(x)
|
||||
124
rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
Normal file
124
rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
|
||||
|
||||
# Common initializations
|
||||
ones_ = Constant(value=1.)
|
||||
zeros_ = Constant(value=0.)
|
||||
trunc_normal_ = TruncatedNormal(std=.02)
|
||||
|
||||
|
||||
# Common Layers
|
||||
def drop_path(x, drop_prob=0., training=False):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...

    Args:
        x: input tensor; the first axis is treated as the sample axis.
        drop_prob (float|None): probability of dropping a sample's path.
            ``None`` and ``0.`` both mean "never drop".
        training (bool): when false the input is returned unchanged.

    Returns:
        ``x`` itself when nothing is dropped, otherwise ``x / keep_prob``
        multiplied by a per-sample 0/1 mask.
    """
    # `DropPath()` defaults to drop_prob=None; treat it as "no drop" instead
    # of failing on `1 - None` below.
    if drop_prob is None or drop_prob == 0. or not training:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    # One random draw per sample, broadcast over all remaining axes.
    shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
    random_tensor = paddle.floor(random_tensor)  # binarize
    # Rescale kept samples by 1 / keep_prob.
    output = x.divide(keep_prob) * random_tensor
    return output
|
||||
|
||||
|
||||
class DropPath(nn.Layer):
    """Layer wrapper around ``drop_path`` that follows the layer's training flag.

    Args:
        drop_prob: drop probability handed to ``drop_path`` on every call.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply ``drop_path`` to ``x`` using the stored probability."""
        is_training = self.training
        return drop_path(x, self.drop_prob, is_training)
|
||||
|
||||
|
||||
class Identity(nn.Layer):
    """No-op layer: ``forward`` returns its argument unchanged."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        """Return ``input`` as-is."""
        output = input
        return output
|
||||
|
||||
|
||||
# common funcs
|
||||
|
||||
|
||||
def to_2tuple(x):
    """Return ``x`` unchanged if it is a list or tuple, else ``(x, x)``."""
    return x if isinstance(x, (list, tuple)) else (x, x)
|
||||
|
||||
|
||||
def add_parameter(layer, datas, name=None):
    """Create a parameter on ``layer`` initialised with the values of ``datas``.

    Args:
        layer: object providing ``create_parameter`` / ``add_parameter``.
        datas: initial values; its ``shape`` becomes the parameter shape.
        name: when truthy, the parameter is also registered on ``layer``
            under this name.

    Returns:
        The created parameter.
    """
    created = layer.create_parameter(
        shape=datas.shape, default_initializer=Assign(datas))
    if not name:
        return created
    layer.add_parameter(name, created)
    return created
|
||||
|
||||
|
||||
def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
        (num_h, num_w): number of windows along height and width
    """
    B, H, W, C = paddle.shape(x)

    # Amount needed to round H and W up to a multiple of window_size
    # (0 when already a multiple).
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    # F.pad is applied on the channel-first layout, then the layout is
    # restored to [B, H, W, C].
    x = F.pad(x.transpose([0, 3, 1, 2]),
              paddle.to_tensor(
                  [0, int(pad_w), 0, int(pad_h)],
                  dtype='int32')).transpose([0, 2, 3, 1])
    Hp, Wp = H + pad_h, W + pad_w

    num_h, num_w = Hp // window_size, Wp // window_size

    # Split both spatial axes into (window index, offset inside window) and
    # bring the two window-index axes next to the batch axis.
    x = x.reshape([B, num_h, window_size, num_w, window_size, C])
    windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
        [-1, window_size, window_size, C])
    return windows, (Hp, Wp), (num_h, num_w)
|
||||
|
||||
|
||||
def window_unpartition(x, pad_hw, num_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        pad_hw (Tuple): padded height and width (Hp, Wp).
        num_hw (Tuple): number of windows along height and width (num_h, num_w).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    num_h, num_w = num_hw
    H, W = hw
    B, window_size, _, C = paddle.shape(x)
    # The leading axis holds batch * windows; recover the batch size.
    B = B // (num_h * num_w)
    x = x.reshape([B, num_h, num_w, window_size, window_size, C])
    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])

    # Crop away the padding added during partitioning.
    return x[:, :H, :W, :]
|
||||
652
rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
Normal file
652
rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
Normal file
@@ -0,0 +1,652 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
from .transformer_utils import zeros_, DropPath, Identity
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Two-layer feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): input width.
        hidden_features (int, optional): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): output width; falls back to
            ``in_features`` when falsy.
        act_layer: activation layer class, instantiated without arguments.
        drop (float): dropout probability used after both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the feed-forward block to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
||||
|
||||
|
||||
class Attention(nn.Layer):
    """Multi-head self-attention with optional q/v bias and optional
    per-layer relative position bias table.

    Args:
        dim (int): token embedding width.
        num_heads (int): number of attention heads.
        qkv_bias (bool): create learnable biases for q and v (the k bias is
            fixed to zeros in ``forward``).
        qk_scale (float, optional): attention scale; defaults to
            ``head_dim ** -0.5`` when falsy.
        attn_drop (float): dropout on the attention probabilities.
        proj_drop (float): dropout after the output projection.
        window_size (sequence of 2 ints, optional): when truthy, a relative
            position bias table and its index buffer are created.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 window_size=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # The projection itself has no bias; the bias is assembled manually
        # in forward() from q_bias / v_bias.
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)

        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        if window_size:
            self.window_size = window_size
            # The "+ 3" reserves three extra table rows for the cls-token
            # relations listed below.
            self.num_relative_distance = (2 * window_size[0] - 1) * (
                2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = self.create_parameter(
                shape=(self.num_relative_distance, num_heads),
                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
            # cls to token & token 2 cls & cls to cls

            # get pair-wise relative position index for each token inside the window
            coords_h = paddle.arange(window_size[0])
            coords_w = paddle.arange(window_size[1])
            coords = paddle.stack(paddle.meshgrid(
                [coords_h, coords_w]))  # 2, Wh, Ww
            coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
            coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
            coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
            relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
            )

            #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]       # 2, Wh*Ww, Wh*Wh
            relative_coords = relative_coords.transpose(
                (1, 2, 0))  #.contiguous()  # Wh*Ww, Wh*Ww, 2
            relative_coords[:, :, 0] += window_size[
                0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            # Combine the two offsets into one flat index.
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            # One extra row/column at index 0 for the cls token.
            relative_position_index = \
                paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
            relative_position_index[1:, 1:] = relative_coords.sum(
                -1)  # Wh*Ww, Wh*Ww
            # Row 0, column 0 and cell (0, 0) point at the three reserved
            # table rows.
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index",
                                 relative_position_index)
            # trunc_normal_(self.relative_position_bias_table, std=.0)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None):
        """Apply self-attention.

        Args:
            x: tokens; axes 1 and 2 are read as (N tokens, C channels).
            rel_pos_bias: optional extra bias added to the attention logits.

        Returns:
            Tensor reshaped to ``(-1, N, C)`` after projection and dropout.
        """
        x_shape = paddle.shape(x)
        N, C = x_shape[1], x_shape[2]

        qkv_bias = None
        if self.q_bias is not None:
            # Bias layout is [q, k, v]; the k slot is all zeros.
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
        qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)

        # Split into q, k, v and move the head axis ahead of the token axis.
        qkv = qkv.reshape((-1, N, 3, self.num_heads,
                           C // self.num_heads)).transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale

        if self.relative_position_bias_table is not None:
            # Gather the per-pair bias from the table using the index buffer.
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.reshape([-1])].reshape([
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1, -1
                ])  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.transpose(
                (2, 0, 1))  #.contiguous()  # nH, Wh*Ww, Wh*Ww
            attn = attn + relative_position_bias.unsqueeze(0)
        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = nn.functional.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)

        # Merge the heads back into the channel axis.
        x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
|
||||
|
||||
|
||||
class Block(nn.Layer):
    """Pre-norm transformer block: attention and MLP, each with a residual
    connection, optional drop-path and optional learnable per-channel scale.

    Args:
        dim (int): token embedding width.
        num_heads (int): number of attention heads.
        mlp_ratio (float): MLP hidden width as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop, window_size: forwarded to ``Attention``.
        drop (float): dropout used by the attention projection and the MLP.
        drop_path (float): stochastic-depth probability; ``<= 0`` disables it.
        init_values (float, optional): when not None, creates ``gamma_1`` /
            ``gamma_2`` scale parameters initialised to this value.
        act_layer: activation class for the MLP.
        norm_layer (str): expression evaluated to obtain the class of
            ``norm2``.
        epsilon (float): epsilon passed to ``norm2``.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 window_size=None,
                 init_values=None,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-5):
        super().__init__()
        # NOTE: norm1 is always nn.LayerNorm with epsilon=1e-6; it ignores
        # the `norm_layer` / `epsilon` arguments (only norm2 uses them).
        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size)
        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        # NOTE(review): eval() executes the `norm_layer` string as code; it
        # must only ever come from trusted configuration.
        self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)
        if init_values is not None:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
        else:
            self.gamma_1, self.gamma_2 = None, None

    def forward(self, x, rel_pos_bias=None):
        """Apply attention then MLP, each added back onto ``x``.

        Args:
            x: input tokens.
            rel_pos_bias: optional bias forwarded to the attention layer.
        """

        if self.gamma_1 is None:
            x = x + self.drop_path(
                self.attn(
                    self.norm1(x), rel_pos_bias=rel_pos_bias))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            # Same as above, with each branch scaled by its gamma.
            x = x + self.drop_path(self.gamma_1 * self.attn(
                self.norm1(x), rel_pos_bias=rel_pos_bias))
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """Image to patch embedding via a strided convolution.

    Args:
        img_size (sequence of 2 ints): nominal input size used to derive the
            patch-grid bookkeeping attributes.
        patch_size (int): kernel size and stride of the projection.
        in_chans (int): number of input channels.
        embed_dim (int): number of output channels.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size

        # Patch counts along img_size[0] and img_size[1] respectively.
        count_0 = img_size[0] // patch_size
        count_1 = img_size[1] // patch_size
        self.num_patches_w = count_0
        self.num_patches_h = count_1
        self.patch_shape = (count_0, count_1)
        self.num_patches = count_0 * count_1

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    @property
    def num_patches_in_h(self):
        """Patch count derived from ``img_size[1]``."""
        size_1 = self.img_size[1]
        return size_1 // self.patch_size

    @property
    def num_patches_in_w(self):
        """Patch count derived from ``img_size[0]``."""
        size_0 = self.img_size[0]
        return size_0 // self.patch_size

    def forward(self, x, mask=None):
        """Project ``x`` to patch embeddings; ``mask`` is not used."""
        # Unpacking keeps the requirement that the input shape has 4 entries.
        _batch, _chans, _height, _width = x.shape
        return self.proj(x)
|
||||
|
||||
|
||||
class RelativePositionBias(nn.Layer):
    """Learnable relative position bias shared across transformer blocks.

    Builds the same bias table and index buffer as the per-layer variant in
    ``Attention``: one table row per relative offset inside the window, plus
    three extra rows for the cls-token relations.

    Args:
        window_size (sequence of 2 ints): patch-grid size (Wh, Ww).
        num_heads (int): number of attention heads (table columns).
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1) + 3
        # Fixed: the keyword is `default_initializer` (was `default_initialize`).
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww

        # Same unsqueeze-based pairwise difference as in `Attention`.
        relative_coords = paddle.unsqueeze(
            coords_flatten, 2) - paddle.unsqueeze(
                coords_flatten, 1)  # 2, Wh*Ww, Wh*Ww
        # Fixed: the method is `transpose` (was `transpos`).
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        # Combine the two offsets into one flat index.
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        # Fixed: paddle.zeros takes `shape=` (was `size=`).
        relative_position_index = paddle.zeros(
            shape=(window_size[0] * window_size[1] + 1, ) * 2,
            dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(
            -1)  # Wh*Ww, Wh*Ww
        # Row 0, column 0 and cell (0, 0) point at the three reserved rows.
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1
        self.register_buffer("relative_position_index", relative_position_index)

    def forward(self):
        """Return the bias gathered from the table, laid out as
        ``(nH, Wh*Ww + 1, Wh*Ww + 1)``."""
        relative_position_bias = \
            self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
                self.window_size[0] * self.window_size[1] + 1,
                self.window_size[0] * self.window_size[1] + 1, -1])  # Wh*Ww,Wh*Ww,nH
        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
|
||||
|
||||
|
||||
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
    ''' Sinusoid position encoding table.

    Args:
        n_position (int): number of positions (table rows).
        d_hid (int): encoding width (table columns).
        token (bool): when true, one all-zero row is appended after the
            position rows.

    Returns:
        float32 tensor of shape ``(1, n_position [+ 1], d_hid)``; even
        columns hold sines and odd columns hold cosines.
    '''

    def get_position_angle_vec(position):
        # Columns 2i and 2i+1 share the same frequency.
        return [
            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
            for hid_j in range(d_hid)
        ]

    sinusoid_table = np.array(
        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    if token:
        # Fixed: numpy's keyword is `axis`; `dim=0` raised TypeError.
        sinusoid_table = np.concatenate(
            [sinusoid_table, np.zeros([1, d_hid])], axis=0)

    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
|
||||
|
||||
|
||||
@register
|
||||
@serializable
|
||||
class VisionTransformer(nn.Layer):
|
||||
""" Vision Transformer with support for patch input
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
img_size=[672, 1092],
|
||||
patch_size=16,
|
||||
in_chans=3,
|
||||
embed_dim=768,
|
||||
depth=12,
|
||||
num_heads=12,
|
||||
mlp_ratio=4,
|
||||
qkv_bias=False,
|
||||
qk_scale=None,
|
||||
drop_rate=0.,
|
||||
attn_drop_rate=0.,
|
||||
drop_path_rate=0.,
|
||||
norm_layer='nn.LayerNorm',
|
||||
init_values=None,
|
||||
use_rel_pos_bias=False,
|
||||
use_shared_rel_pos_bias=False,
|
||||
epsilon=1e-5,
|
||||
final_norm=False,
|
||||
pretrained=None,
|
||||
out_indices=[3, 5, 7, 11],
|
||||
use_abs_pos_emb=False,
|
||||
use_sincos_pos_emb=True,
|
||||
with_fpn=True,
|
||||
num_fpn_levels=4,
|
||||
use_checkpoint=False,
|
||||
**args):
|
||||
super().__init__()
|
||||
self.img_size = img_size
|
||||
self.embed_dim = embed_dim
|
||||
self.with_fpn = with_fpn
|
||||
self.use_checkpoint = use_checkpoint
|
||||
self.use_sincos_pos_emb = use_sincos_pos_emb
|
||||
self.use_rel_pos_bias = use_rel_pos_bias
|
||||
self.final_norm = final_norm
|
||||
self.out_indices = out_indices
|
||||
self.num_fpn_levels = num_fpn_levels
|
||||
|
||||
if use_checkpoint:
|
||||
paddle.seed(0)
|
||||
|
||||
self.patch_embed = PatchEmbed(
|
||||
img_size=img_size,
|
||||
patch_size=patch_size,
|
||||
in_chans=in_chans,
|
||||
embed_dim=embed_dim)
|
||||
|
||||
self.pos_w = self.patch_embed.num_patches_in_w
|
||||
self.pos_h = self.patch_embed.num_patches_in_h
|
||||
|
||||
self.cls_token = self.create_parameter(
|
||||
shape=(1, 1, embed_dim),
|
||||
default_initializer=paddle.nn.initializer.Constant(value=0.))
|
||||
|
||||
if use_abs_pos_emb:
|
||||
self.pos_embed = self.create_parameter(
|
||||
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
|
||||
default_initializer=paddle.nn.initializer.TruncatedNormal(
|
||||
std=.02))
|
||||
elif use_sincos_pos_emb:
|
||||
pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
|
||||
|
||||
self.pos_embed = pos_embed
|
||||
self.pos_embed = self.create_parameter(shape=pos_embed.shape)
|
||||
self.pos_embed.set_value(pos_embed.numpy())
|
||||
self.pos_embed.stop_gradient = True
|
||||
|
||||
else:
|
||||
self.pos_embed = None
|
||||
|
||||
self.pos_drop = nn.Dropout(p=drop_rate)
|
||||
|
||||
if use_shared_rel_pos_bias:
|
||||
self.rel_pos_bias = RelativePositionBias(
|
||||
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
|
||||
else:
|
||||
self.rel_pos_bias = None
|
||||
|
||||
dpr = np.linspace(0, drop_path_rate, depth)
|
||||
|
||||
self.blocks = nn.LayerList([
|
||||
Block(
|
||||
dim=embed_dim,
|
||||
num_heads=num_heads,
|
||||
mlp_ratio=mlp_ratio,
|
||||
qkv_bias=qkv_bias,
|
||||
qk_scale=qk_scale,
|
||||
drop=drop_rate,
|
||||
attn_drop=attn_drop_rate,
|
||||
drop_path=dpr[i],
|
||||
norm_layer=norm_layer,
|
||||
init_values=init_values,
|
||||
window_size=self.patch_embed.patch_shape
|
||||
if use_rel_pos_bias else None,
|
||||
epsilon=epsilon) for i in range(depth)
|
||||
])
|
||||
|
||||
self.pretrained = pretrained
|
||||
self.init_weight()
|
||||
|
||||
assert len(out_indices) <= 4, ''
|
||||
self.out_indices = out_indices
|
||||
self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
|
||||
self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
|
||||
patch_size for _ in range(len(out_indices))
|
||||
]
|
||||
|
||||
self.norm = Identity()
|
||||
|
||||
if self.with_fpn:
|
||||
assert num_fpn_levels <= 4, ''
|
||||
self.init_fpn(
|
||||
embed_dim=embed_dim,
|
||||
patch_size=patch_size, )
|
||||
|
||||
def init_weight(self):
|
||||
pretrained = self.pretrained
|
||||
|
||||
if pretrained:
|
||||
if 'http' in pretrained: #URL
|
||||
path = paddle.utils.download.get_weights_path_from_url(
|
||||
pretrained)
|
||||
else: #model in local path
|
||||
path = pretrained
|
||||
|
||||
load_state_dict = paddle.load(path)
|
||||
model_state_dict = self.state_dict()
|
||||
pos_embed_name = "pos_embed"
|
||||
|
||||
if pos_embed_name in load_state_dict.keys():
|
||||
load_pos_embed = paddle.to_tensor(
|
||||
load_state_dict[pos_embed_name], dtype="float32")
|
||||
if self.pos_embed.shape != load_pos_embed.shape:
|
||||
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
|
||||
model_state_dict[pos_embed_name] = self.resize_pos_embed(
|
||||
load_pos_embed, (pos_size, pos_size),
|
||||
(self.pos_h, self.pos_w))
|
||||
|
||||
# self.set_state_dict(model_state_dict)
|
||||
load_state_dict[pos_embed_name] = model_state_dict[
|
||||
pos_embed_name]
|
||||
|
||||
print("Load pos_embed and resize it from {} to {} .".format(
|
||||
load_pos_embed.shape, self.pos_embed.shape))
|
||||
|
||||
self.set_state_dict(load_state_dict)
|
||||
print("Load load_state_dict....")
|
||||
|
||||
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
|
||||
if patch_size == 16:
|
||||
self.fpn1 = nn.Sequential(
|
||||
nn.Conv2DTranspose(
|
||||
embed_dim, embed_dim, kernel_size=2, stride=2),
|
||||
nn.BatchNorm2D(embed_dim),
|
||||
nn.GELU(),
|
||||
nn.Conv2DTranspose(
|
||||
embed_dim, embed_dim, kernel_size=2, stride=2), )
|
||||
|
||||
self.fpn2 = nn.Sequential(
|
||||
nn.Conv2DTranspose(
|
||||
embed_dim, embed_dim, kernel_size=2, stride=2), )
|
||||
|
||||
self.fpn3 = Identity()
|
||||
|
||||
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
|
||||
elif patch_size == 8:
|
||||
self.fpn1 = nn.Sequential(
|
||||
nn.Conv2DTranspose(
|
||||
embed_dim, embed_dim, kernel_size=2, stride=2), )
|
||||
|
||||
self.fpn2 = Identity()
|
||||
|
||||
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
|
||||
|
||||
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
|
||||
|
||||
if not out_with_norm:
|
||||
self.norm = Identity()
|
||||
else:
|
||||
self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
|
||||
|
||||
def interpolate_pos_encoding(self, x, w, h):
|
||||
npatch = x.shape[1] - 1
|
||||
N = self.pos_embed.shape[1] - 1
|
||||
w0 = w // self.patch_embed.patch_size
|
||||
h0 = h // self.patch_embed.patch_size
|
||||
if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
|
||||
return self.pos_embed
|
||||
class_pos_embed = self.pos_embed[:, 0]
|
||||
patch_pos_embed = self.pos_embed[:, 1:]
|
||||
dim = x.shape[-1]
|
||||
# we add a small number to avoid floating point error in the interpolation
|
||||
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
||||
# w0, h0 = w0 + 0.1, h0 + 0.1
|
||||
# patch_pos_embed = nn.functional.interpolate(
|
||||
# patch_pos_embed.reshape([
|
||||
# 1, self.patch_embed.num_patches_w,
|
||||
# self.patch_embed.num_patches_h, dim
|
||||
# ]).transpose((0, 3, 1, 2)),
|
||||
# scale_factor=(w0 / self.patch_embed.num_patches_w,
|
||||
# h0 / self.patch_embed.num_patches_h),
|
||||
# mode='bicubic', )
|
||||
|
||||
patch_pos_embed = nn.functional.interpolate(
|
||||
patch_pos_embed.reshape([
|
||||
1, self.patch_embed.num_patches_w,
|
||||
self.patch_embed.num_patches_h, dim
|
||||
]).transpose((0, 3, 1, 2)),
|
||||
(w0, h0),
|
||||
mode='bicubic', )
|
||||
|
||||
assert int(w0) == patch_pos_embed.shape[-2] and int(
|
||||
h0) == patch_pos_embed.shape[-1]
|
||||
patch_pos_embed = patch_pos_embed.transpose(
|
||||
(0, 2, 3, 1)).reshape([1, -1, dim])
|
||||
return paddle.concat(
|
||||
(class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)
|
||||
|
||||
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
|
||||
"""
|
||||
Resize pos_embed weight.
|
||||
Args:
|
||||
pos_embed (Tensor): the pos_embed weight
|
||||
old_hw (list[int]): the height and width of old pos_embed
|
||||
new_hw (list[int]): the height and width of new pos_embed
|
||||
Returns:
|
||||
Tensor: the resized pos_embed weight
|
||||
"""
|
||||
cls_pos_embed = pos_embed[:, :1, :]
|
||||
pos_embed = pos_embed[:, 1:, :]
|
||||
|
||||
pos_embed = pos_embed.transpose([0, 2, 1])
|
||||
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
|
||||
pos_embed = F.interpolate(
|
||||
pos_embed, new_hw, mode='bicubic', align_corners=False)
|
||||
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
|
||||
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
|
||||
|
||||
return pos_embed
|
||||
|
||||
def build_2d_sincos_position_embedding(
|
||||
self,
|
||||
embed_dim=768,
|
||||
temperature=10000., ):
|
||||
h, w = self.patch_embed.patch_shape
|
||||
grid_w = paddle.arange(w, dtype=paddle.float32)
|
||||
grid_h = paddle.arange(h, dtype=paddle.float32)
|
||||
grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
|
||||
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
|
||||
pos_dim = embed_dim // 4
|
||||
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
|
||||
omega = 1. / (temperature**omega)
|
||||
|
||||
out_w = grid_w.flatten()[..., None] @omega[None]
|
||||
out_h = grid_h.flatten()[..., None] @omega[None]
|
||||
|
||||
pos_emb = paddle.concat(
|
||||
[
|
||||
paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
|
||||
paddle.cos(out_h)
|
||||
],
|
||||
axis=1)[None, :, :]
|
||||
|
||||
pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
|
||||
pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
|
||||
# pos_embed.stop_gradient = True
|
||||
|
||||
return pos_embed
|
||||
|
||||
def forward(self, x):
|
||||
x = x['image'] if isinstance(x, dict) else x
|
||||
_, _, h, w = x.shape
|
||||
|
||||
x = self.patch_embed(x)
|
||||
|
||||
B, D, Hp, Wp = x.shape # b * c * h * w
|
||||
|
||||
cls_tokens = self.cls_token.expand(
|
||||
(B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
|
||||
x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c
|
||||
x = paddle.concat([cls_tokens, x], axis=1)
|
||||
|
||||
if self.pos_embed is not None:
|
||||
# x = x + self.interpolate_pos_encoding(x, w, h)
|
||||
x = x + self.interpolate_pos_encoding(x, h, w)
|
||||
|
||||
x = self.pos_drop(x)
|
||||
|
||||
rel_pos_bias = self.rel_pos_bias(
|
||||
) if self.rel_pos_bias is not None else None
|
||||
|
||||
feats = []
|
||||
for idx, blk in enumerate(self.blocks):
|
||||
if self.use_checkpoint and self.training:
|
||||
x = paddle.distributed.fleet.utils.recompute(
|
||||
blk, x, rel_pos_bias, **{"preserve_rng_state": True})
|
||||
else:
|
||||
x = blk(x, rel_pos_bias)
|
||||
|
||||
if idx in self.out_indices:
|
||||
xp = paddle.reshape(
|
||||
paddle.transpose(
|
||||
self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
|
||||
shape=[B, D, Hp, Wp])
|
||||
feats.append(xp)
|
||||
|
||||
if self.with_fpn:
|
||||
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
|
||||
-self.num_fpn_levels:]
|
||||
assert len(fpns) == len(feats) or len(feats) == 1, ''
|
||||
outputs = []
|
||||
for i, m in enumerate(fpns):
|
||||
outputs.append(
|
||||
m(feats[i] if len(feats) == len(fpns) else feats[-1]))
|
||||
|
||||
return outputs
|
||||
|
||||
return feats
|
||||
|
||||
@property
def num_layers(self):
    """Number of transformer blocks held in ``self.blocks``."""
    block_count = len(self.blocks)
    return block_count
|
||||
|
||||
@property
def no_weight_decay(self):
    """Names of the parameters this property lists as exempt from weight decay."""
    exempt = set(('pos_embed', 'cls_token'))
    return exempt
|
||||
|
||||
@property
def out_shape(self):
    """Build one ``ShapeSpec`` per output level.

    Pairs ``self.out_channels`` with ``self.out_strides`` element-wise.
    """
    specs = []
    for channels, stride in zip(self.out_channels, self.out_strides):
        specs.append(ShapeSpec(channels=channels, stride=stride))
    return specs
|
||||
749
rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
Normal file
749
rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
Normal file
@@ -0,0 +1,749 @@
|
||||
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
import math
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Constant, TruncatedNormal
|
||||
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
|
||||
window_unpartition)
|
||||
from ..initializer import linear_init_
|
||||
|
||||
__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): size of the input features.
        hidden_features (int, optional): size of the hidden layer; falls
            back to ``in_features`` when falsy.
        out_features (int, optional): size of the output; falls back to
            ``in_features`` when falsy.
        act_layer (str): expression evaluated with ``eval`` to obtain the
            activation class, which is then instantiated without arguments.
        drop (float): dropout probability used after both linear layers.
        lr_factor (float): learning-rate multiplier set on the weights and
            biases of both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='nn.GELU',
                 drop=0.,
                 lr_factor=1.0):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        def scaled_attr():
            # A fresh ParamAttr per parameter, as each one is configured
            # independently.
            return ParamAttr(learning_rate=lr_factor)

        self.fc1 = nn.Linear(
            in_features,
            hidden_features,
            weight_attr=scaled_attr(),
            bias_attr=scaled_attr())
        # NOTE: ``act_layer`` is evaluated as Python code; it must come from
        # trusted configuration only.
        self.act = eval(act_layer)()
        self.fc2 = nn.Linear(
            hidden_features,
            out_features,
            weight_attr=scaled_attr(),
            bias_attr=scaled_attr())
        self.drop = nn.Dropout(drop)

        self._init_weights()

    def _init_weights(self):
        """Apply ``linear_init_`` to both linear layers."""
        for layer in (self.fc1, self.fc2):
            linear_init_(layer)

    def forward(self, x):
        """Apply the two-layer MLP to ``x`` and return the result."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        hidden = self.drop(hidden)
        out = self.fc2(hidden)
        return self.drop(out)
|
||||
|
||||
|
||||
class Attention(nn.Layer):
    """Multi-head self-attention over a channel-last 2D token grid.

    ``forward`` unpacks its input shape as ``(B, H, W, C)`` and returns a
    tensor reshaped to ``[B, H, W, C]``. Decomposed relative positional
    embeddings can optionally be added to the attention logits.

    Args:
        dim (int): channel size of the tokens.
        num_heads (int): number of attention heads; the per-head size is
            ``dim // num_heads``.
        qkv_bias (bool): create learnable ``q_bias`` / ``v_bias``; in
            ``forward`` they are concatenated with a zero key bias and used
            as the bias of the qkv projection.
        attn_bias (bool): give the ``qkv`` linear layer its own bias.
        attn_drop (float): dropout probability on the attention weights.
        proj_drop (float): accepted for interface compatibility; this layer
            does not use it.
        use_rel_pos (bool): add decomposed relative positional embeddings.
        rel_pos_zero_init (bool): keep the relative position tables at zero;
            when False they are re-initialised with a truncated normal
            (std 0.02).
        window_size (int, optional): size used for the relative position
            tables; defaults to ``input_size[0]`` when None.
        input_size (sequence, optional): token grid size, only read when
            ``window_size`` is None.
        qk_scale (float, optional): logit scale; defaults to
            ``head_dim ** -0.5`` when falsy.
        lr_factor (float): learning-rate multiplier for the linear layers
            and the relative position tables.

    Raises:
        ValueError: if ``use_rel_pos`` is set but neither ``window_size``
            nor ``input_size`` is given.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_bias=False,
                 attn_drop=0.,
                 proj_drop=0.,
                 use_rel_pos=False,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 qk_scale=None,
                 lr_factor=1.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim**-0.5
        self.use_rel_pos = use_rel_pos
        self.input_size = input_size
        self.rel_pos_zero_init = rel_pos_zero_init
        self.window_size = window_size
        self.lr_factor = lr_factor

        self.qkv = nn.Linear(
            dim,
            dim * 3,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor)
            if attn_bias else False)
        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        self.proj = nn.Linear(
            dim,
            dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.attn_drop = nn.Dropout(attn_drop)

        # Fall back to the grid size, but only when one was supplied:
        # indexing a missing ``input_size`` used to raise TypeError even
        # when relative positions were not requested.
        if window_size is None and input_size is not None:
            self.window_size = input_size[0]
        if use_rel_pos and self.window_size is None:
            raise ValueError(
                "Attention with use_rel_pos=True requires window_size or "
                "input_size.")

        self._init_weights()

    def _init_weights(self):
        """Initialise the linear layers and create the rel-pos tables."""
        linear_init_(self.qkv)
        linear_init_(self.proj)

        if self.use_rel_pos:
            self.rel_pos_h = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))
            self.rel_pos_w = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))

            if not self.rel_pos_zero_init:
                # An initializer must be built first and then called on the
                # parameter; constructing it with the parameter as an
                # argument initialises nothing.
                trunc_normal = TruncatedNormal(std=0.02)
                trunc_normal(self.rel_pos_h)
                trunc_normal(self.rel_pos_w)

    def get_rel_pos(self, seq_size, rel_pos):
        """Gather relative position embeddings for every (query, key) pair.

        Args:
            seq_size: number of positions along one axis.
            rel_pos (Tensor): table with one row per relative offset.

        Returns:
            Tensor: reshaped to ``[seq_size, seq_size, head_dim]``.
        """
        max_rel_dist = int(2 * seq_size - 1)
        if rel_pos.shape[0] != max_rel_dist:
            # Linearly interpolate the table to the required number of
            # relative offsets.
            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
            rel_pos = rel_pos.transpose([0, 2, 1])
            rel_pos_resized = F.interpolate(
                rel_pos,
                size=(max_rel_dist, ),
                mode="linear",
                data_format='NCW')
            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
            rel_pos_resized = rel_pos_resized.transpose([1, 0])
        else:
            rel_pos_resized = rel_pos

        # Offsets (i - j) shifted by seq_size - 1 so they start at zero.
        coords = paddle.arange(seq_size, dtype='float32')
        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
        relative_coords += (seq_size - 1)
        relative_coords = relative_coords.astype('int64').flatten()

        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
            [seq_size, seq_size, self.head_dim])

    def add_decomposed_rel_pos(self, attn, q, h, w):
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        Args:
            attn (Tensor): attention map.
            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
            h: height of the token grid.
            w: width of the token grid.
        Returns:
            attn (Tensor): attention map with added relative positional embeddings.
        """
        Rh = self.get_rel_pos(h, self.rel_pos_h)
        Rw = self.get_rel_pos(w, self.rel_pos_w)

        B, _, dim = q.shape
        r_q = q.reshape([B, h, w, dim])
        # Height term broadcasts over key width, width term over key height.
        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)

        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
        return attn.reshape([B, h * w, h * w])

    def forward(self, x):
        """Apply self-attention to ``x``, unpacked as ``(B, H, W, C)``.

        Returns:
            Tensor: attention output after the final projection, reshaped
            to ``[B, H, W, C]``.
        """
        B, H, W, C = paddle.shape(x)

        if self.q_bias is not None:
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)

        # Split into q/k/v and fold the heads into the batch axis. This has
        # to happen on both branches; otherwise indexing ``qkv`` below would
        # slice the batch axis instead of selecting q, k and v.
        qkv = qkv.reshape(
            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
                [2, 0, 3, 1, 4]).reshape(
                    [3, B * self.num_heads, H * W, self.head_dim])

        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale

        if self.use_rel_pos:
            attn = self.add_decomposed_rel_pos(attn, q, H, W)

        attn = F.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)
        # Unfold the heads and merge them back into the channel axis.
        x = attn.matmul(v).reshape(
            [B, self.num_heads, H * W, self.head_dim]).transpose(
                [0, 2, 1, 3]).reshape([B, H, W, C])
        x = self.proj(x)
        return x
|
||||
|
||||
|
||||
class Block(nn.Layer):
    """Transformer block: normalised attention and MLP, each with a residual.

    When ``window_size`` is not None the normalised input is split with
    ``window_partition`` before attention and merged back with
    ``window_unpartition`` afterwards. When ``init_values`` is given, two
    learnable per-channel scales (``gamma_1``, ``gamma_2``) multiply the
    attention and MLP outputs.

    ``act_layer`` and ``norm_layer`` are expressions evaluated with ``eval``
    to obtain the corresponding classes.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 attn_bias=False,
                 qk_scale=None,
                 init_values=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 use_rel_pos=True,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 act_layer='nn.GELU',
                 norm_layer='nn.LayerNorm',
                 lr_factor=1.0,
                 epsilon=1e-5):
        super().__init__()
        self.window_size = window_size

        def norm_kwargs():
            # Normalisation parameters follow lr_factor and use a zero
            # L2 regularizer.
            return dict(
                weight_attr=ParamAttr(
                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(
                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
                epsilon=epsilon)

        self.norm1 = eval(norm_layer)(dim, **norm_kwargs())
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_bias=attn_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            window_size=window_size,
            input_size=input_size,
            lr_factor=lr_factor)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        self.norm2 = eval(norm_layer)(dim, **norm_kwargs())
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop,
                       lr_factor=lr_factor)

        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]),
                default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]),
                default_initializer=Constant(value=init_values))

    def forward(self, x):
        """Apply the attention and MLP sub-blocks to ``x`` and return it."""
        windowed = self.window_size is not None

        attn_out = self.norm1(x)
        if windowed:
            attn_out, pad_hw, num_hw = window_partition(attn_out,
                                                        self.window_size)
        attn_out = self.attn(attn_out)
        if self.gamma_1 is not None:
            attn_out = self.gamma_1 * attn_out
        if windowed:
            attn_out = window_unpartition(attn_out, pad_hw, num_hw,
                                          (x.shape[1], x.shape[2]))
        x = x + self.drop_path(attn_out)

        mlp_out = self.mlp(self.norm2(x))
        if self.gamma_2 is not None:
            mlp_out = self.gamma_2 * mlp_out
        return x + self.drop_path(mlp_out)
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """Image to patch embedding via a convolution with stride == kernel size.

    Args:
        img_size (sequence): two-element image size, stored as given.
        patch_size (int): kernel size and stride of the projection.
        in_chans (int): number of input channels.
        embed_dim (int): number of output channels.
        lr_factor (float): learning-rate multiplier for the projection's
            weight and bias.
    """

    def __init__(self,
                 img_size=(224, 224),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 lr_factor=0.01):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        conv_attrs = {
            'weight_attr': ParamAttr(learning_rate=lr_factor),
            'bias_attr': ParamAttr(learning_rate=lr_factor),
        }
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            **conv_attrs)

    @property
    def num_patches_in_h(self):
        """``img_size[1]`` floor-divided by the patch size."""
        extent = self.img_size[1]
        return extent // self.patch_size

    @property
    def num_patches_in_w(self):
        """``img_size[0]`` floor-divided by the patch size."""
        extent = self.img_size[0]
        return extent // self.patch_size

    def forward(self, x):
        """Return the convolutional projection of ``x``."""
        return self.proj(x)
|
||||
|
||||
|
||||
@register
@serializable
class VisionTransformer2D(nn.Layer):
    """Vision Transformer backbone that keeps tokens on a 2D grid.

    The patch embedding output is moved to channel-last layout
    (``[B, Hp, Wp, C]``) and passed through ``depth`` blocks. Blocks whose
    index is in ``global_attn_indexes`` are built with ``window_size=None``;
    all others receive ``window_size``. Feature maps are collected after the
    blocks listed in ``out_indices`` and returned channel-first.

    Position embedding options:
        use_abs_pos: add a 2D sin-cos embedding computed in ``forward``.
        use_abs_pos_emb: add a learned embedding (with a leading class-token
            slot) that is bicubically resized to the current grid.
        use_sincos_pos_emb: store a fixed sin-cos table in ``pos_embed``.
            NOTE(review): ``forward`` in this class does not read that
            table; confirm whether it is consumed elsewhere.

    With ``with_fpn=True`` the collected maps are passed through the last
    ``len(out_indices)`` of the four FPN heads, matching ``out_strides``.
    """

    def __init__(self,
                 img_size=(1024, 1024),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 attn_bias=False,
                 qk_scale=None,
                 init_values=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 act_layer='nn.GELU',
                 norm_layer='nn.LayerNorm',
                 lr_decay_rate=1.0,
                 global_attn_indexes=(2, 5, 8, 11),
                 use_abs_pos=False,
                 use_rel_pos=False,
                 use_abs_pos_emb=False,
                 use_sincos_pos_emb=False,
                 rel_pos_zero_init=True,
                 epsilon=1e-5,
                 final_norm=False,
                 pretrained=None,
                 window_size=None,
                 out_indices=(11, ),
                 with_fpn=False,
                 use_checkpoint=False,
                 *args,
                 **kwargs):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.depth = depth
        self.global_attn_indexes = global_attn_indexes
        self.epsilon = epsilon
        self.with_fpn = with_fpn
        self.use_checkpoint = use_checkpoint

        self.patch_h = img_size[0] // patch_size
        self.patch_w = img_size[1] // patch_size
        self.num_patches = self.patch_h * self.patch_w
        self.use_abs_pos = use_abs_pos
        self.use_abs_pos_emb = use_abs_pos_emb

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim)

        # Drop-path rate grows linearly from 0 to drop_path_rate over depth.
        dpr = np.linspace(0, drop_path_rate, depth)
        if use_checkpoint:
            paddle.seed(0)

        if use_abs_pos_emb:
            self.pos_w = self.patch_embed.num_patches_in_w
            self.pos_h = self.patch_embed.num_patches_in_h
            # One extra leading slot for a class token.
            self.pos_embed = self.create_parameter(
                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
                default_initializer=TruncatedNormal(std=.02))
        elif use_sincos_pos_emb:
            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
                                                              self.patch_w)
            # Stored as a parameter excluded from gradient updates.
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
            self.pos_embed.set_value(pos_embed.numpy())
            self.pos_embed.stop_gradient = True
        else:
            self.pos_embed = None

        self.blocks = nn.LayerList([
            Block(
                embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                attn_bias=attn_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=None
                if i in self.global_attn_indexes else window_size,
                input_size=[self.patch_h, self.patch_w],
                act_layer=act_layer,
                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
                norm_layer=norm_layer,
                init_values=init_values,
                epsilon=epsilon) for i in range(depth)
        ])

        assert len(out_indices) <= 4, 'out_indices out of bound'
        self.out_indices = out_indices
        self.pretrained = pretrained
        self.init_weight()

        self.out_channels = [embed_dim for _ in range(len(out_indices))]
        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
            patch_size for _ in range(len(out_indices))
        ]
        self.norm = Identity()
        if self.with_fpn:
            self.init_fpn(
                embed_dim=embed_dim,
                patch_size=patch_size,
                out_with_norm=final_norm)

    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
        """Return ``lr_decay_rate ** (depth - layer_id)`` for a block."""
        return lr_decay_rate**(self.depth - layer_id)

    def init_weight(self):
        """Load ``self.pretrained`` (a local path or URL) if it is set.

        A stored ``pos_embed`` whose shape differs from this model's is
        resized first; it is assumed to cover a square grid plus one
        class-token slot.
        """
        pretrained = self.pretrained
        if not pretrained:
            return

        if 'http' in pretrained:
            path = paddle.utils.download.get_weights_path_from_url(
                pretrained)
        else:
            path = pretrained

        load_state_dict = paddle.load(path)
        pos_embed_name = "pos_embed"

        if pos_embed_name in load_state_dict and self.use_abs_pos_emb:
            load_pos_embed = paddle.to_tensor(
                load_state_dict[pos_embed_name], dtype="float32")
            if self.pos_embed.shape != load_pos_embed.shape:
                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
                load_state_dict[pos_embed_name] = self.resize_pos_embed(
                    load_pos_embed, (pos_size, pos_size),
                    (self.pos_h, self.pos_w))

                print("Load pos_embed and resize it from {} to {} .".format(
                    load_pos_embed.shape, self.pos_embed.shape))

        self.set_state_dict(load_state_dict)
        print("Load load_state_dict....")

    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
        """Create the four FPN heads (strides 4, 8, 16, 32) and ``self.norm``.

        Raises:
            ValueError: if ``patch_size`` is neither 8 nor 16; no heads are
                defined for other sizes, so ``forward`` could not run.
        """
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.BatchNorm2D(embed_dim),
                nn.GELU(),
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn3 = Identity()

            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = Identity()

            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )

            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
        else:
            raise ValueError(
                f"with_fpn only supports patch_size 8 or 16, got {patch_size}.")

        if not out_with_norm:
            self.norm = Identity()
        else:
            self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)

    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
        """
        Resize pos_embed weight.
        Args:
            pos_embed (Tensor): the pos_embed weight, with a leading
                class-token slot on axis 1
            old_hw (list[int]): the height and width of old pos_embed
            new_hw (list[int]): the height and width of new pos_embed
        Returns:
            Tensor: the resized pos_embed weight, class-token slot included
        """
        cls_pos_embed = pos_embed[:, :1, :]
        pos_embed = pos_embed[:, 1:, :]

        pos_embed = pos_embed.transpose([0, 2, 1])
        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
        pos_embed = F.interpolate(
            pos_embed, new_hw, mode='bicubic', align_corners=False)
        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)

        return pos_embed

    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
        """Build a 2D sin-cos position embedding reshaped to ``[1, h, w, embed_dim]``.

        Each quarter of the channels holds sin(y), cos(y), sin(x), cos(x)
        at ``embed_dim // 4`` frequencies derived from ``temperature``.
        """
        grid_y, grid_x = paddle.meshgrid(
            paddle.arange(
                h, dtype=paddle.float32),
            paddle.arange(
                w, dtype=paddle.float32))
        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = self.embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = (1. / (temperature**omega)).unsqueeze(0)

        out_x = grid_x.reshape([-1, 1]).matmul(omega)
        out_y = grid_y.reshape([-1, 1]).matmul(omega)

        pos_emb = paddle.concat(
            [
                paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
                paddle.cos(out_x)
            ],
            axis=1)

        return pos_emb.reshape([1, h, w, self.embed_dim])

    def forward(self, inputs):
        """Compute backbone features for ``inputs['image']``.

        Returns:
            list: channel-first feature maps, one per collected block (after
            the matching FPN head when ``with_fpn`` is set).
        """
        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
        B, Hp, Wp, _ = paddle.shape(x)

        if self.use_abs_pos:
            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)

        if self.use_abs_pos_emb:
            pos_embed = self.resize_pos_embed(self.pos_embed,
                                              (self.pos_h, self.pos_w),
                                              (Hp, Wp))
            # resize_pos_embed returns [1, 1 + Hp*Wp, C] including the
            # class-token slot; the tokens here form a [B, Hp, Wp, C] grid
            # with no class token, so drop that slot and restore the grid.
            x = x + pos_embed[:, 1:, :].reshape([1, Hp, Wp, self.embed_dim])

        feats = []
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    blk, x, **{"preserve_rng_state": True})
            else:
                x = blk(x)
            if idx in self.out_indices:
                # Normalise while channels are still the last axis (what
                # nn.LayerNorm(embed_dim) expects), then go channel-first.
                feats.append(self.norm(x).transpose([0, 3, 1, 2]))

        if self.with_fpn:
            # Use the trailing heads so the outputs match out_strides, which
            # advertises the last len(out_indices) of [4, 8, 16, 32].
            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][-len(feats):]
            feats = [fpn(feat) for fpn, feat in zip(fpns, feats)]
        return feats

    @property
    def num_layers(self):
        """Number of transformer blocks."""
        return len(self.blocks)

    @property
    def no_weight_decay(self):
        """Parameter names listed as exempt from weight decay."""
        return {'pos_embed', 'cls_token'}

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per output level (channels and stride)."""
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self.out_channels, self.out_strides)
        ]
|
||||
|
||||
|
||||
class LayerNorm(nn.Layer):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
    variance normalization over the channel dimension for inputs that have shape
    (batch_size, channels, height, width).
    Note that this modified LayerNorm is only used in ResBlock and SimpleFeaturePyramid.

    In ViT, we use the nn.LayerNorm

    Args:
        normalized_shape (int): number of channels (size of axis 1).
        eps (float): value added to the variance before the square root.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        # Start as an identity transform (scale 1, shift 0). Without an
        # explicit initializer both parameters would get the framework's
        # default weight initialisation.
        self.weight = self.create_parameter(
            [normalized_shape], default_initializer=Constant(value=1.))
        self.bias = self.create_parameter(
            [normalized_shape], default_initializer=Constant(value=0.))
        self.eps = eps
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalise ``x`` over axis 1, then apply the per-channel affine."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / paddle.sqrt(s + self.eps)
        # Broadcast the per-channel parameters over the two trailing axes.
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
|
||||
|
||||
|
||||
@register
@serializable
class SimpleFeaturePyramid(nn.Layer):
    """Build a multi-scale feature pyramid from a single backbone feature map.

    Each stage rescales the first input feature map (x4, x2, x1 or x0.5) and
    projects it to ``out_channels`` with a 1x1 and a 3x3 convolution, each
    followed by the channel-wise ``LayerNorm``. One extra level is produced
    by sub-sampling the last stage's output with stride 2.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 spatial_scales,
                 num_levels=4,
                 use_bias=False):
        """
        Args:
            in_channels (list[int]): input channels of each level which can be
                derived from the output shape of backbone by from_config;
                only the first entry is used
            out_channels (int): output channel of each level.
            spatial_scales (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features which can be derived from
                the output shape of backbone by from_config; only the first
                entry is used
            num_levels (int): number of levels of output features (4 or 5).
            use_bias (bool): whether use bias or not.
        Raises:
            NotImplementedError: if num_levels is neither 4 nor 5.
        """
        super(SimpleFeaturePyramid, self).__init__()

        self.in_channels = in_channels[0]
        self.out_channels = out_channels
        self.num_levels = num_levels

        self.stages = []
        if num_levels == 4:
            scale_factors = [2.0, 1.0, 0.5]
        elif num_levels == 5:
            scale_factors = [4.0, 2.0, 1.0, 0.5]
        else:
            raise NotImplementedError(
                f"num_levels={num_levels} is not supported yet.")

        dim = self.in_channels
        for scale in scale_factors:
            out_dim = dim
            if scale == 4.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2),
                    # Channel-wise LayerNorm defined in this file: the
                    # feature map is (N, C, H, W), and nn.LayerNorm would
                    # normalise the trailing width axis instead.
                    LayerNorm(dim // 2),
                    nn.GELU(),
                    nn.Conv2DTranspose(
                        dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2)
                ]
                out_dim = dim // 2
            elif scale == 1.0:
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]

            layers.extend([
                nn.Conv2D(
                    out_dim,
                    out_channels,
                    kernel_size=1,
                    bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(
                        out_channels,
                        out_channels,
                        kernel_size=3,
                        padding=1,
                        bias_attr=use_bias, ), LayerNorm(out_channels)
            ])
            layers = nn.Sequential(*layers)

            # Name the stage after log2 of its output stride.
            stage = -int(math.log2(spatial_scales[0] * scale))
            self.add_sublayer(f"simfp_{stage}", layers)
            self.stages.append(layers)

        # top block output feature maps.
        self.top_block = nn.Sequential(
            nn.MaxPool2D(
                kernel_size=1, stride=2, padding=0))

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``spatial_scales`` from the input shapes."""
        return {
            'in_channels': [i.channels for i in input_shape],
            'spatial_scales': [1.0 / i.stride for i in input_shape],
        }

    @property
    def out_shape(self):
        """One ``ShapeSpec`` with ``out_channels`` channels per output level."""
        return [
            ShapeSpec(channels=self.out_channels)
            for _ in range(self.num_levels)
        ]

    def forward(self, feats):
        """
        Args:
            feats (list[Tensor]): backbone features; only the first entry,
                a Tensor of shape (N,C,H,W), is used.
        Returns:
            list[Tensor]: ``num_levels`` feature maps, one per stage plus
            the sub-sampled top level.
        """
        features = feats[0]
        results = [stage(features) for stage in self.stages]

        # The extra top level is derived from the last stage's output.
        results.append(self.top_block(results[-1]))
        assert self.num_levels == len(results)

        return results
|
||||
Reference in New Issue
Block a user