first commit

This commit is contained in:
陈赣
2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .resnet import *
from .darknet import *
from .mobilenet_v1 import *
from .mobilenet_v3 import *
from .shufflenet_v2 import *
from .swin_transformer import *
from .lcnet import *
from .cspresnet import *
from .csp_darknet import *
from .convnext import *
from .vision_transformer import *
from .mobileone import *
from .trans_encoder import *
from .focalnet import *
from .vit_mae import *
from .hgnet_v2 import *

View File

@@ -0,0 +1,245 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Modified from https://github.com/facebookresearch/ConvNeXt
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
import numpy as np
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from .transformer_utils import DropPath, trunc_normal_, zeros_
__all__ = ['ConvNeXt']
class Block(nn.Layer):
r""" ConvNeXt Block. There are two equivalent implementations:
(1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W)
(2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back
We use (2) as we find it slightly faster in Pypaddle
Args:
dim (int): Number of input channels.
drop_path (float): Stochastic depth rate. Default: 0.0
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
super().__init__()
self.dwconv = nn.Conv2D(
dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv
self.norm = LayerNorm(dim, eps=1e-6)
self.pwconv1 = nn.Linear(
dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers
self.act = nn.GELU()
self.pwconv2 = nn.Linear(4 * dim, dim)
if layer_scale_init_value > 0:
self.gamma = self.create_parameter(
shape=(dim, ),
attr=ParamAttr(initializer=Constant(layer_scale_init_value)))
else:
self.gamma = None
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity(
)
def forward(self, x):
input = x
x = self.dwconv(x)
x = x.transpose([0, 2, 3, 1])
x = self.norm(x)
x = self.pwconv1(x)
x = self.act(x)
x = self.pwconv2(x)
if self.gamma is not None:
x = self.gamma * x
x = x.transpose([0, 3, 1, 2])
x = input + self.drop_path(x)
return x
class LayerNorm(nn.Layer):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(1.)))
self.bias = self.create_parameter(
shape=(normalized_shape, ),
attr=ParamAttr(initializer=Constant(0.)))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape, )
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(x, self.normalized_shape, self.weight,
self.bias, self.eps)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / paddle.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
@register
@serializable
class ConvNeXt(nn.Layer):
r""" ConvNeXt
A Pypaddle impl of : `A ConvNet for the 2020s` -
https://arxiv.org/pdf/2201.03545.pdf
Args:
in_chans (int): Number of input image channels. Default: 3
depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3]
dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768]
drop_path_rate (float): Stochastic depth rate. Default: 0.
layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
"""
arch_settings = {
'tiny': {
'depths': [3, 3, 9, 3],
'dims': [96, 192, 384, 768]
},
'small': {
'depths': [3, 3, 27, 3],
'dims': [96, 192, 384, 768]
},
'base': {
'depths': [3, 3, 27, 3],
'dims': [128, 256, 512, 1024]
},
'large': {
'depths': [3, 3, 27, 3],
'dims': [192, 384, 768, 1536]
},
'xlarge': {
'depths': [3, 3, 27, 3],
'dims': [256, 512, 1024, 2048]
},
}
def __init__(
self,
arch='tiny',
in_chans=3,
drop_path_rate=0.,
layer_scale_init_value=1e-6,
return_idx=[1, 2, 3],
norm_output=True,
pretrained=None, ):
super().__init__()
depths = self.arch_settings[arch]['depths']
dims = self.arch_settings[arch]['dims']
self.downsample_layers = nn.LayerList(
) # stem and 3 intermediate downsampling conv layers
stem = nn.Sequential(
nn.Conv2D(
in_chans, dims[0], kernel_size=4, stride=4),
LayerNorm(
dims[0], eps=1e-6, data_format="channels_first"))
self.downsample_layers.append(stem)
for i in range(3):
downsample_layer = nn.Sequential(
LayerNorm(
dims[i], eps=1e-6, data_format="channels_first"),
nn.Conv2D(
dims[i], dims[i + 1], kernel_size=2, stride=2), )
self.downsample_layers.append(downsample_layer)
self.stages = nn.LayerList(
) # 4 feature resolution stages, each consisting of multiple residual blocks
dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))]
cur = 0
for i in range(4):
stage = nn.Sequential(* [
Block(
dim=dims[i],
drop_path=dp_rates[cur + j],
layer_scale_init_value=layer_scale_init_value)
for j in range(depths[i])
])
self.stages.append(stage)
cur += depths[i]
self.return_idx = return_idx
self.dims = [dims[i] for i in return_idx] # [::-1]
self.norm_output = norm_output
if norm_output:
self.norms = nn.LayerList([
LayerNorm(
c, eps=1e-6, data_format="channels_first")
for c in self.dims
])
self.apply(self._init_weights)
if pretrained is not None:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _init_weights(self, m):
if isinstance(m, (nn.Conv2D, nn.Linear)):
trunc_normal_(m.weight)
zeros_(m.bias)
def forward_features(self, x):
output = []
for i in range(4):
x = self.downsample_layers[i](x)
x = self.stages[i](x)
output.append(x)
outputs = [output[i] for i in self.return_idx]
if self.norm_output:
outputs = [self.norms[i](out) for i, out in enumerate(outputs)]
return outputs
def forward(self, x):
x = self.forward_features(x['image'])
return x
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self.dims]

View File

@@ -0,0 +1,404 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.initializer import conv_init_
from ..shape_spec import ShapeSpec
__all__ = [
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
]
class BaseConv(nn.Layer):
def __init__(self,
in_channels,
out_channels,
ksize,
stride,
groups=1,
bias=False,
act="silu"):
super(BaseConv, self).__init__()
self.conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=ksize,
stride=stride,
padding=(ksize - 1) // 2,
groups=groups,
bias_attr=bias)
self.bn = nn.BatchNorm2D(
out_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self._init_weights()
def _init_weights(self):
conv_init_(self.conv)
def forward(self, x):
# use 'x * F.sigmoid(x)' replace 'silu'
x = self.bn(self.conv(x))
y = x * F.sigmoid(x)
return y
class DWConv(nn.Layer):
"""Depthwise Conv"""
def __init__(self,
in_channels,
out_channels,
ksize,
stride=1,
bias=False,
act="silu"):
super(DWConv, self).__init__()
self.dw_conv = BaseConv(
in_channels,
in_channels,
ksize=ksize,
stride=stride,
groups=in_channels,
bias=bias,
act=act)
self.pw_conv = BaseConv(
in_channels,
out_channels,
ksize=1,
stride=1,
groups=1,
bias=bias,
act=act)
def forward(self, x):
return self.pw_conv(self.dw_conv(x))
class Focus(nn.Layer):
"""Focus width and height information into channel space, used in YOLOX."""
def __init__(self,
in_channels,
out_channels,
ksize=3,
stride=1,
bias=False,
act="silu"):
super(Focus, self).__init__()
self.conv = BaseConv(
in_channels * 4,
out_channels,
ksize=ksize,
stride=stride,
bias=bias,
act=act)
def forward(self, inputs):
# inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2]
top_left = inputs[:, :, 0::2, 0::2]
top_right = inputs[:, :, 0::2, 1::2]
bottom_left = inputs[:, :, 1::2, 0::2]
bottom_right = inputs[:, :, 1::2, 1::2]
outputs = paddle.concat(
[top_left, bottom_left, top_right, bottom_right], 1)
return self.conv(outputs)
class BottleNeck(nn.Layer):
def __init__(self,
in_channels,
out_channels,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(BottleNeck, self).__init__()
hidden_channels = int(out_channels * expansion)
Conv = DWConv if depthwise else BaseConv
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = Conv(
hidden_channels,
out_channels,
ksize=3,
stride=1,
bias=bias,
act=act)
self.add_shortcut = shortcut and in_channels == out_channels
def forward(self, x):
y = self.conv2(self.conv1(x))
if self.add_shortcut:
y = y + x
return y
class SPPLayer(nn.Layer):
"""Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX"""
def __init__(self,
in_channels,
out_channels,
kernel_sizes=(5, 9, 13),
bias=False,
act="silu"):
super(SPPLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpoolings = nn.LayerList([
nn.MaxPool2D(
kernel_size=ks, stride=1, padding=ks // 2)
for ks in kernel_sizes
])
conv2_channels = hidden_channels * (len(kernel_sizes) + 1)
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1)
x = self.conv2(x)
return x
class SPPFLayer(nn.Layer):
""" Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
equivalent to SPP(k=(5, 9, 13))
"""
def __init__(self,
in_channels,
out_channels,
ksize=5,
bias=False,
act='silu'):
super(SPPFLayer, self).__init__()
hidden_channels = in_channels // 2
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.maxpooling = nn.MaxPool2D(
kernel_size=ksize, stride=1, padding=ksize // 2)
conv2_channels = hidden_channels * 4
self.conv2 = BaseConv(
conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)
def forward(self, x):
x = self.conv1(x)
y1 = self.maxpooling(x)
y2 = self.maxpooling(y1)
y3 = self.maxpooling(y2)
concats = paddle.concat([x, y1, y2, y3], axis=1)
out = self.conv2(concats)
return out
class CSPLayer(nn.Layer):
"""CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5"""
def __init__(self,
in_channels,
out_channels,
num_blocks=1,
shortcut=True,
expansion=0.5,
depthwise=False,
bias=False,
act="silu"):
super(CSPLayer, self).__init__()
hidden_channels = int(out_channels * expansion)
self.conv1 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.conv2 = BaseConv(
in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
self.bottlenecks = nn.Sequential(* [
BottleNeck(
hidden_channels,
hidden_channels,
shortcut=shortcut,
expansion=1.0,
depthwise=depthwise,
bias=bias,
act=act) for _ in range(num_blocks)
])
self.conv3 = BaseConv(
hidden_channels * 2,
out_channels,
ksize=1,
stride=1,
bias=bias,
act=act)
def forward(self, x):
x_1 = self.conv1(x)
x_1 = self.bottlenecks(x_1)
x_2 = self.conv2(x)
x = paddle.concat([x_1, x_2], axis=1)
x = self.conv3(x)
return x
@register
@serializable
class CSPDarkNet(nn.Layer):
"""
CSPDarkNet backbone.
Args:
arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
depth_mult (float): Depth multiplier, multiply number of channels in
each layer, default as 1.0.
width_mult (float): Width multiplier, multiply number of blocks in
CSPLayer, default as 1.0.
depthwise (bool): Whether to use depth-wise conv layer.
act (str): Activation function type, default as 'silu'.
return_idx (list): Index of stages whose feature maps are returned.
"""
__shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
# in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
# 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
arch_settings = {
'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 1024, 3, True, True]],
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
[256, 512, 9, True, False], [512, 768, 3, True, False],
[768, 1024, 3, True, True]],
}
def __init__(self,
arch='X',
depth_mult=1.0,
width_mult=1.0,
depthwise=False,
act='silu',
trt=False,
return_idx=[2, 3, 4]):
super(CSPDarkNet, self).__init__()
self.arch = arch
self.return_idx = return_idx
Conv = DWConv if depthwise else BaseConv
arch_setting = self.arch_settings[arch]
base_channels = int(arch_setting[0][0] * width_mult)
# Note: differences between the latest YOLOv5 and the original YOLOX
# 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX)
# 2. use SPPF(in YOLOv5) or SPP(in YOLOX)
# 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer
# 4. whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX
if arch in ['P5', 'P6']:
# in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size)
self.stem = Conv(
3, base_channels, ksize=6, stride=2, bias=False, act=act)
spp_kernal_sizes = 5
elif arch in ['X']:
# in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes)
self.stem = Focus(
3, base_channels, ksize=3, stride=1, bias=False, act=act)
spp_kernal_sizes = (5, 9, 13)
else:
raise AttributeError("Unsupported arch type: {}".format(arch))
_out_channels = [base_channels]
layers_num = 1
self.csp_dark_blocks = []
for i, (in_channels, out_channels, num_blocks, shortcut,
use_spp) in enumerate(arch_setting):
in_channels = int(in_channels * width_mult)
out_channels = int(out_channels * width_mult)
_out_channels.append(out_channels)
num_blocks = max(round(num_blocks * depth_mult), 1)
stage = []
conv_layer = self.add_sublayer(
'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
Conv(
in_channels, out_channels, 3, 2, bias=False, act=act))
stage.append(conv_layer)
layers_num += 1
if use_spp and arch in ['X']:
# in YOLOX use SPPLayer
spp_layer = self.add_sublayer(
'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
SPPLayer(
out_channels,
out_channels,
kernel_sizes=spp_kernal_sizes,
bias=False,
act=act))
stage.append(spp_layer)
layers_num += 1
csp_layer = self.add_sublayer(
'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
CSPLayer(
out_channels,
out_channels,
num_blocks=num_blocks,
shortcut=shortcut,
depthwise=depthwise,
bias=False,
act=act))
stage.append(csp_layer)
layers_num += 1
if use_spp and arch in ['P5', 'P6']:
# in latest YOLOv5 use SPPFLayer instead of SPPLayer
sppf_layer = self.add_sublayer(
'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
SPPFLayer(
out_channels,
out_channels,
ksize=5,
bias=False,
act=act))
stage.append(sppf_layer)
layers_num += 1
self.csp_dark_blocks.append(nn.Sequential(*stage))
self._out_channels = [_out_channels[i] for i in self.return_idx]
self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
def forward(self, inputs):
x = inputs['image']
outputs = []
x = self.stem(x)
for i, layer in enumerate(self.csp_dark_blocks):
x = layer(x)
if i + 1 in self.return_idx:
outputs.append(x)
return outputs
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self._out_channels, self.strides)
]

View File

@@ -0,0 +1,321 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
act=None):
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
bias_attr=False)
self.bn = nn.BatchNorm2D(
ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class RepVggBlock(nn.Layer):
def __init__(self, ch_in, ch_out, act='relu', alpha=False):
super(RepVggBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.conv1 = ConvBNLayer(
ch_in, ch_out, 3, stride=1, padding=1, act=None)
self.conv2 = ConvBNLayer(
ch_in, ch_out, 1, stride=1, padding=0, act=None)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
if alpha:
self.alpha = self.create_parameter(
shape=[1],
attr=ParamAttr(initializer=Constant(value=1.)),
dtype="float32")
else:
self.alpha = None
def forward(self, x):
if hasattr(self, 'conv'):
y = self.conv(x)
else:
if self.alpha:
y = self.conv1(x) + self.alpha * self.conv2(x)
else:
y = self.conv1(x) + self.conv2(x)
y = self.act(y)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv'):
self.conv = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=3,
stride=1,
padding=1,
groups=1)
kernel, bias = self.get_equivalent_kernel_bias()
self.conv.weight.set_value(kernel)
self.conv.bias.set_value(bias)
self.__delattr__('conv1')
self.__delattr__('conv2')
def get_equivalent_kernel_bias(self):
kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
if self.alpha:
return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + self.alpha * bias1x1
else:
return kernel3x3 + self._pad_1x1_to_3x3_tensor(
kernel1x1), bias3x3 + bias1x1
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
return nn.functional.pad(kernel1x1, [1, 1, 1, 1])
def _fuse_bn_tensor(self, branch):
if branch is None:
return 0, 0
kernel = branch.conv.weight
running_mean = branch.bn._mean
running_var = branch.bn._variance
gamma = branch.bn.weight
beta = branch.bn.bias
eps = branch.bn._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std
class BasicBlock(nn.Layer):
def __init__(self,
ch_in,
ch_out,
act='relu',
shortcut=True,
use_alpha=False):
super(BasicBlock, self).__init__()
assert ch_in == ch_out
self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
self.shortcut = shortcut
def forward(self, x):
y = self.conv1(x)
y = self.conv2(y)
if self.shortcut:
return paddle.add(x, y)
else:
return y
class EffectiveSELayer(nn.Layer):
""" Effective Squeeze-Excitation
From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667
"""
def __init__(self, channels, act='hardsigmoid'):
super(EffectiveSELayer, self).__init__()
self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
x_se = x.mean((2, 3), keepdim=True)
x_se = self.fc(x_se)
return x * self.act(x_se)
class CSPResStage(nn.Layer):
def __init__(self,
block_fn,
ch_in,
ch_out,
n,
stride,
act='relu',
attn='eca',
use_alpha=False):
super(CSPResStage, self).__init__()
ch_mid = (ch_in + ch_out) // 2
if stride == 2:
self.conv_down = ConvBNLayer(
ch_in, ch_mid, 3, stride=2, padding=1, act=act)
else:
self.conv_down = None
self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act)
self.blocks = nn.Sequential(*[
block_fn(
ch_mid // 2,
ch_mid // 2,
act=act,
shortcut=True,
use_alpha=use_alpha) for i in range(n)
])
if attn:
self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
else:
self.attn = None
self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)
def forward(self, x):
if self.conv_down is not None:
x = self.conv_down(x)
y1 = self.conv1(x)
y2 = self.blocks(self.conv2(x))
y = paddle.concat([y1, y2], axis=1)
if self.attn is not None:
y = self.attn(y)
y = self.conv3(y)
return y
@register
@serializable
class CSPResNet(nn.Layer):
__shared__ = ['width_mult', 'depth_mult', 'trt']
def __init__(self,
layers=[3, 6, 6, 3],
channels=[64, 128, 256, 512, 1024],
act='swish',
return_idx=[1, 2, 3],
depth_wise=False,
use_large_stem=False,
width_mult=1.0,
depth_mult=1.0,
trt=False,
use_checkpoint=False,
use_alpha=False,
**args):
super(CSPResNet, self).__init__()
self.use_checkpoint = use_checkpoint
channels = [max(round(c * width_mult), 1) for c in channels]
layers = [max(round(l * depth_mult), 1) for l in layers]
act = get_act_fn(
act, trt=trt) if act is None or isinstance(act,
(str, dict)) else act
if use_large_stem:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0] // 2,
3,
stride=1,
padding=1,
act=act)), ('conv3', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
else:
self.stem = nn.Sequential(
('conv1', ConvBNLayer(
3, channels[0] // 2, 3, stride=2, padding=1, act=act)),
('conv2', ConvBNLayer(
channels[0] // 2,
channels[0],
3,
stride=1,
padding=1,
act=act)))
n = len(channels) - 1
self.stages = nn.Sequential(*[(str(i), CSPResStage(
BasicBlock,
channels[i],
channels[i + 1],
layers[i],
2,
act=act,
use_alpha=use_alpha)) for i in range(n)])
self._out_channels = channels[1:]
self._out_strides = [4 * 2**i for i in range(n)]
self.return_idx = return_idx
if use_checkpoint:
paddle.seed(0)
def forward(self, inputs):
x = inputs['image']
x = self.stem(x)
outs = []
for idx, stage in enumerate(self.stages):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
stage, x, **{"preserve_rng_state": True})
else:
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]

View File

@@ -0,0 +1,345 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.ops import batch_norm, mish
from ..shape_spec import ShapeSpec
__all__ = ['DarkNet', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=1,
groups=1,
padding=0,
norm_type='bn',
norm_decay=0.,
act="leaky",
freeze_norm=False,
data_format='NCHW',
name=''):
"""
conv + bn + activation layer
Args:
ch_in (int): input channel
ch_out (int): output channel
filter_size (int): filter size, default 3
stride (int): stride, default 1
groups (int): number of groups of conv layer, default 1
padding (int): padding size, default 0
norm_type (str): batch norm type, default bn
norm_decay (str): decay for weight and bias of batch norm layer, default 0.
act (str): activation function type, default 'leaky', which means leaky_relu
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(ConvBNLayer, self).__init__()
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=groups,
data_format=data_format,
bias_attr=False)
self.batch_norm = batch_norm(
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.act = act
def forward(self, inputs):
out = self.conv(inputs)
out = self.batch_norm(out)
if self.act == 'leaky':
out = F.leaky_relu(out, 0.1)
else:
out = getattr(F, self.act)(out)
return out
class DownSample(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size=3,
stride=2,
padding=1,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
downsample layer
Args:
ch_in (int): input channel
ch_out (int): output channel
filter_size (int): filter size, default 3
stride (int): stride, default 2
padding (int): padding size, default 1
norm_type (str): batch norm type, default bn
norm_decay (str): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(DownSample, self).__init__()
self.conv_bn_layer = ConvBNLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=filter_size,
stride=stride,
padding=padding,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.ch_out = ch_out
def forward(self, inputs):
out = self.conv_bn_layer(inputs)
return out
class BasicBlock(nn.Layer):
def __init__(self,
ch_in,
ch_out,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
BasicBlock layer of DarkNet
Args:
ch_in (int): input channel
ch_out (int): output channel
norm_type (str): batch norm type, default bn
norm_decay (str): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
data_format (str): data format, NCHW or NHWC
"""
super(BasicBlock, self).__init__()
assert ch_in == ch_out and (ch_in % 2) == 0, \
f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
# example:
# --------------{conv1} --> {conv2}
# channel route: 10-->5 --> 5-->10
self.conv1 = ConvBNLayer(
ch_in=ch_in,
ch_out=int(ch_out / 2),
filter_size=1,
stride=1,
padding=0,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.conv2 = ConvBNLayer(
ch_in=int(ch_out / 2),
ch_out=ch_out,
filter_size=3,
stride=1,
padding=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
def forward(self, inputs):
conv1 = self.conv1(inputs)
conv2 = self.conv2(conv1)
out = paddle.add(x=inputs, y=conv2)
return out
class Blocks(nn.Layer):
def __init__(self,
ch_in,
ch_out,
count,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None,
data_format='NCHW'):
"""
Blocks layer, which consist of some BaickBlock layers
Args:
ch_in (int): input channel
ch_out (int): output channel
count (int): number of BasicBlock layer
norm_type (str): batch norm type, default bn
norm_decay (str): decay for weight and bias of batch norm layer, default 0.
freeze_norm (bool): whether to freeze norm, default False
name (str): layer name
data_format (str): data format, NCHW or NHWC
"""
super(Blocks, self).__init__()
self.basicblock0 = BasicBlock(
ch_in,
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.res_out_list = []
for i in range(1, count):
block_name = '{}.{}'.format(name, i)
res_out = self.add_sublayer(
block_name,
BasicBlock(
ch_out,
ch_out,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format))
self.res_out_list.append(res_out)
self.ch_out = ch_out
def forward(self, inputs):
y = self.basicblock0(inputs)
for basic_block_i in self.res_out_list:
y = basic_block_i(y)
return y
DarkNet_cfg = {53: ([1, 2, 8, 8, 4])}
@register
@serializable
class DarkNet(nn.Layer):
__shared__ = ['norm_type', 'data_format']
def __init__(self,
depth=53,
freeze_at=-1,
return_idx=[2, 3, 4],
num_stages=5,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
data_format='NCHW'):
"""
Darknet, see https://pjreddie.com/darknet/yolo/
Args:
depth (int): depth of network
freeze_at (int): freeze the backbone at which stage
filter_size (int): filter size, default 3
return_idx (list): index of stages whose feature maps are returned
norm_type (str): batch norm type, default bn
norm_decay (str): decay for weight and bias of batch norm layer, default 0.
data_format (str): data format, NCHW or NHWC
"""
super(DarkNet, self).__init__()
self.depth = depth
self.freeze_at = freeze_at
self.return_idx = return_idx
self.num_stages = num_stages
self.stages = DarkNet_cfg[self.depth][0:num_stages]
self.conv0 = ConvBNLayer(
ch_in=3,
ch_out=32,
filter_size=3,
stride=1,
padding=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self.downsample0 = DownSample(
ch_in=32,
ch_out=32 * 2,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format)
self._out_channels = []
self.darknet_conv_block_list = []
self.downsample_list = []
ch_in = [64, 128, 256, 512, 1024]
for i, stage in enumerate(self.stages):
name = 'stage.{}'.format(i)
conv_block = self.add_sublayer(
name,
Blocks(
int(ch_in[i]),
int(ch_in[i]),
stage,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format,
name=name))
self.darknet_conv_block_list.append(conv_block)
if i in return_idx:
self._out_channels.append(int(ch_in[i]))
for i in range(num_stages - 1):
down_name = 'stage.{}.downsample'.format(i)
downsample = self.add_sublayer(
down_name,
DownSample(
ch_in=int(ch_in[i]),
ch_out=int(ch_in[i + 1]),
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
data_format=data_format))
self.downsample_list.append(downsample)
def forward(self, inputs):
x = inputs['image']
out = self.conv0(x)
out = self.downsample0(out)
blocks = []
for i, conv_block_i in enumerate(self.darknet_conv_block_list):
out = conv_block_i(out)
if i == self.freeze_at:
out.stop_gradient = True
if i in self.return_idx:
blocks.append(out)
if i < self.num_stages - 1:
out = self.downsample_list[i](out)
return blocks
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,720 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
from .swin_transformer import Mlp
__all__ = ['FocalNet']
MODEL_cfg = {
'focalnet_T_224_1k_srf': dict(
embed_dim=96,
depths=[2, 2, 6, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.2,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
),
'focalnet_S_224_1k_srf': dict(
embed_dim=96,
depths=[2, 2, 18, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.3,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
),
'focalnet_B_224_1k_srf': dict(
embed_dim=128,
depths=[2, 2, 18, 2],
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
),
'focalnet_T_224_1k_lrf': dict(
embed_dim=96,
depths=[2, 2, 6, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.2,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
),
'focalnet_S_224_1k_lrf': dict(
embed_dim=96,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.3,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
),
'focalnet_B_224_1k_lrf': dict(
embed_dim=128,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=False,
use_postln=False,
use_postln_in_modulation=False,
use_layerscale=False,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
),
'focalnet_L_384_22k_fl3': dict(
embed_dim=192,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[5, 5, 5, 5],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
),
'focalnet_L_384_22k_fl4': dict(
embed_dim=192,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=True, #
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
),
'focalnet_XL_384_22k_fl3': dict(
embed_dim=256,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[5, 5, 5, 5],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
),
'focalnet_XL_384_22k_fl4': dict(
embed_dim=256,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=False,
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
),
'focalnet_H_224_22k_fl3': dict(
embed_dim=352,
depths=[2, 2, 18, 2],
focal_levels=[3, 3, 3, 3],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=True, #
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
),
'focalnet_H_224_22k_fl4': dict(
embed_dim=352,
depths=[2, 2, 18, 2],
focal_levels=[4, 4, 4, 4],
focal_windows=[3, 3, 3, 3],
drop_path_rate=0.5,
use_conv_embed=True,
use_postln=True,
use_postln_in_modulation=True, #
use_layerscale=True,
normalize_modulator=False,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
),
}
class FocalModulation(nn.Layer):
"""
Args:
dim (int): Number of input channels.
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
focal_level (int): Number of focal levels
focal_window (int): Focal window size at focal level 1
focal_factor (int): Step to increase the focal window. Default: 2
use_postln_in_modulation (bool): Whether use post-modulation layernorm
normalize_modulator (bool): Whether use normalize in modulator
"""
def __init__(self,
dim,
proj_drop=0.,
focal_level=2,
focal_window=7,
focal_factor=2,
use_postln_in_modulation=False,
normalize_modulator=False):
super().__init__()
self.dim = dim
# specific args for focalv3
self.focal_level = focal_level
self.focal_window = focal_window
self.focal_factor = focal_factor
self.use_postln_in_modulation = use_postln_in_modulation
self.normalize_modulator = normalize_modulator
self.f = nn.Linear(
dim, 2 * dim + (self.focal_level + 1), bias_attr=True)
self.h = nn.Conv2D(
dim,
dim,
kernel_size=1,
stride=1,
padding=0,
groups=1,
bias_attr=True)
self.act = nn.GELU()
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
self.focal_layers = nn.LayerList()
if self.use_postln_in_modulation:
self.ln = nn.LayerNorm(dim)
for k in range(self.focal_level):
kernel_size = self.focal_factor * k + self.focal_window
self.focal_layers.append(
nn.Sequential(
nn.Conv2D(
dim,
dim,
kernel_size=kernel_size,
stride=1,
groups=dim,
padding=kernel_size // 2,
bias_attr=False),
nn.GELU()))
def forward(self, x):
""" Forward function.
Args:
x: input features with shape of (B, H, W, C)
"""
_, _, _, C = x.shape
x = self.f(x)
x = x.transpose([0, 3, 1, 2])
q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1)
ctx_all = 0
for l in range(self.focal_level):
ctx = self.focal_layers[l](ctx)
ctx_all = ctx_all + ctx * gates[:, l:l + 1]
ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True))
ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
if self.normalize_modulator:
ctx_all = ctx_all / (self.focal_level + 1)
x_out = q * self.h(ctx_all)
x_out = x_out.transpose([0, 2, 3, 1])
if self.use_postln_in_modulation:
x_out = self.ln(x_out)
x_out = self.proj(x_out)
x_out = self.proj_drop(x_out)
return x_out
class FocalModulationBlock(nn.Layer):
""" Focal Modulation Block.
Args:
dim (int): Number of input channels.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
focal_level (int): number of focal levels
focal_window (int): focal kernel size at level 1
use_postln (bool): Whether use layernorm after modulation. Default: False.
use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
normalize_modulator (bool): Whether use normalize in modulator
use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
layerscale_value (float): Value for layer scale. Default: 1e-4
"""
def __init__(self,
dim,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm,
focal_level=2,
focal_window=9,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_layerscale=False,
layerscale_value=1e-4):
super().__init__()
self.dim = dim
self.mlp_ratio = mlp_ratio
self.focal_window = focal_window
self.focal_level = focal_level
self.use_postln = use_postln
self.use_layerscale = use_layerscale
self.norm1 = norm_layer(dim)
self.modulation = FocalModulation(
dim,
proj_drop=drop,
focal_level=self.focal_level,
focal_window=self.focal_window,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.H = None
self.W = None
self.gamma_1 = 1.0
self.gamma_2 = 1.0
if self.use_layerscale:
self.gamma_1 = add_parameter(self,
layerscale_value * paddle.ones([dim]))
self.gamma_2 = add_parameter(self,
layerscale_value * paddle.ones([dim]))
def forward(self, x):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, "input feature has wrong size"
shortcut = x
if not self.use_postln:
x = self.norm1(x)
x = x.reshape([-1, H, W, C])
# FM
x = self.modulation(x).reshape([-1, H * W, C])
if self.use_postln:
x = self.norm1(x)
# FFN
x = shortcut + self.drop_path(self.gamma_1 * x)
if self.use_postln:
x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x)))
else:
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class BasicLayer(nn.Layer):
""" A basic focal modulation layer for one stage.
Args:
dim (int): Number of feature channels
depth (int): Depths of this stage.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop (float, optional): Dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
focal_level (int): Number of focal levels
focal_window (int): Focal window size at focal level 1
use_conv_embed (bool): Whether use overlapped convolution for patch embedding
use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
layerscale_value (float): Value of layerscale
use_postln (bool): Whether use layernorm after modulation. Default: False.
use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
normalize_modulator (bool): Whether use normalize in modulator
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""
def __init__(self,
dim,
depth,
mlp_ratio=4.,
drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None,
focal_level=2,
focal_window=9,
use_conv_embed=False,
use_layerscale=False,
layerscale_value=1e-4,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_checkpoint=False):
super().__init__()
self.depth = depth
self.use_checkpoint = use_checkpoint
# build blocks
self.blocks = nn.LayerList([
FocalModulationBlock(
dim=dim,
mlp_ratio=mlp_ratio,
drop=drop,
drop_path=drop_path[i]
if isinstance(drop_path, np.ndarray) else drop_path,
act_layer=nn.GELU,
norm_layer=norm_layer,
focal_level=focal_level,
focal_window=focal_window,
use_postln=use_postln,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator,
use_layerscale=use_layerscale,
layerscale_value=layerscale_value) for i in range(depth)
])
# patch merging layer
if downsample is not None:
self.downsample = downsample(
patch_size=2,
in_chans=dim,
embed_dim=2 * dim,
use_conv_embed=use_conv_embed,
norm_layer=norm_layer,
is_stem=False)
else:
self.downsample = None
def forward(self, x, H, W):
"""
Args:
x: Input feature, tensor size (B, H*W, C).
"""
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x)
if self.downsample is not None:
x_reshaped = x.transpose([0, 2, 1]).reshape(
[x.shape[0], x.shape[-1], H, W])
x_down = self.downsample(x_reshaped)
x_down = x_down.flatten(2).transpose([0, 2, 1])
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Layer, optional): Normalization layer. Default: None
use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
is_stem (bool): Is the stem block or not.
"""
def __init__(self,
patch_size=4,
in_chans=3,
embed_dim=96,
norm_layer=None,
use_conv_embed=False,
is_stem=False):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.in_chans = in_chans
self.embed_dim = embed_dim
if use_conv_embed:
# if we choose to use conv embedding, then we treat the stem and non-stem differently
if is_stem:
kernel_size = 7
padding = 2
stride = 4
else:
kernel_size = 3
padding = 1
stride = 2
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=kernel_size,
stride=stride,
padding=padding)
else:
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
_, _, H, W = x.shape
if W % self.patch_size[1] != 0:
# for 3D tensor: [pad_left, pad_right]
# for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
W += W % self.patch_size[1]
if H % self.patch_size[0] != 0:
x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
H += H % self.patch_size[0]
x = self.proj(x)
if self.norm is not None:
_, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
return x
@register
@serializable
class FocalNet(nn.Layer):
""" FocalNet backbone
Args:
arch (str): Architecture of FocalNet
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-1 means not freezing any parameters.
patch_size (int | tuple(int)): Patch size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
depths (tuple[int]): Depths of each FocalNet Transformer stage.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
drop_rate (float): Dropout rate.
drop_path_rate (float): Stochastic depth rate. Default: 0.2.
norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
patch_norm (bool): If True, add normalization after patch embedding. Default: True.
focal_levels (Sequence[int]): Number of focal levels at four stages
focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
use_conv_embed (bool): Whether use overlapped convolution for patch embedding
use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
layerscale_value (float): Value of layerscale
use_postln (bool): Whether use layernorm after modulation. Default: False.
use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
normalize_modulator (bool): Whether use normalize in modulator
use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
"""
def __init__(
self,
arch='focalnet_T_224_1k_srf',
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
mlp_ratio=4.,
drop_rate=0.,
drop_path_rate=0.2, # 0.5 better for large+ models
norm_layer=nn.LayerNorm,
patch_norm=True,
focal_levels=[2, 2, 2, 2],
focal_windows=[3, 3, 3, 3],
use_conv_embed=False,
use_layerscale=False,
layerscale_value=1e-4,
use_postln=False,
use_postln_in_modulation=False,
normalize_modulator=False,
use_checkpoint=False,
pretrained=None):
super(FocalNet, self).__init__()
assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
embed_dim = MODEL_cfg[arch]['embed_dim']
depths = MODEL_cfg[arch]['depths']
drop_path_rate = MODEL_cfg[arch]['drop_path_rate']
focal_levels = MODEL_cfg[arch]['focal_levels']
focal_windows = MODEL_cfg[arch]['focal_windows']
use_conv_embed = MODEL_cfg[arch]['use_conv_embed']
use_layerscale = MODEL_cfg[arch]['use_layerscale']
use_postln = MODEL_cfg[arch]['use_postln']
use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation']
normalize_modulator = MODEL_cfg[arch]['normalize_modulator']
if pretrained is None:
pretrained = MODEL_cfg[arch]['pretrained']
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.num_layers = len(depths)
self.patch_norm = patch_norm
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None,
use_conv_embed=use_conv_embed,
is_stem=True)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth decay rule
dpr = np.linspace(0, drop_path_rate, sum(depths))
# build layers
self.layers = nn.LayerList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
mlp_ratio=mlp_ratio,
drop=drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchEmbed
if (i_layer < self.num_layers - 1) else None,
focal_level=focal_levels[i_layer],
focal_window=focal_windows[i_layer],
use_conv_embed=use_conv_embed,
use_layerscale=use_layerscale,
layerscale_value=layerscale_value,
use_postln=use_postln,
use_postln_in_modulation=use_postln_in_modulation,
normalize_modulator=normalize_modulator,
use_checkpoint=use_checkpoint)
self.layers.append(layer)
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.num_features = num_features
# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_sublayer(layer_name, layer)
self.apply(self._init_weights)
self._freeze_stages()
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.stop_gradient = True
if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.stop_gradient = True
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
x = self.patch_embed(x['image'])
B, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.pos_drop(x)
outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose(
(0, 3, 1, 2))
outs.append(out)
return outs
@property
def out_shape(self):
out_strides = [4, 8, 16, 32]
return [
ShapeSpec(
channels=self.num_features[i], stride=out_strides[i])
for i in self.out_indices
]

View File

@@ -0,0 +1,447 @@
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingNormal, Constant
from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
from paddle.regularizer import L2Decay
from paddle import ParamAttr
import copy
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['PPHGNetV2']
kaiming_normal_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class LearnableAffineBlock(nn.Layer):
def __init__(self,
scale_value=1.0,
bias_value=0.0,
lr_mult=1.0,
lab_lr=0.01):
super().__init__()
self.scale = self.create_parameter(
shape=[1, ],
default_initializer=Constant(value=scale_value),
attr=ParamAttr(learning_rate=lr_mult * lab_lr))
self.add_parameter("scale", self.scale)
self.bias = self.create_parameter(
shape=[1, ],
default_initializer=Constant(value=bias_value),
attr=ParamAttr(learning_rate=lr_mult * lab_lr))
self.add_parameter("bias", self.bias)
def forward(self, x):
return self.scale * x + self.bias
class ConvBNAct(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size=3,
stride=1,
padding=1,
groups=1,
use_act=True,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.use_act = use_act
self.use_lab = use_lab
self.conv = Conv2D(
in_channels,
out_channels,
kernel_size,
stride,
padding=padding
if isinstance(padding, str) else (kernel_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr_mult),
bias_attr=False)
self.bn = BatchNorm2D(
out_channels,
weight_attr=ParamAttr(
regularizer=L2Decay(0.0), learning_rate=lr_mult),
bias_attr=ParamAttr(
regularizer=L2Decay(0.0), learning_rate=lr_mult))
if self.use_act:
self.act = ReLU()
if self.use_lab:
self.lab = LearnableAffineBlock(lr_mult=lr_mult)
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.use_act:
x = self.act(x)
if self.use_lab:
x = self.lab(x)
return x
class LightConvBNAct(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
groups=1,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.conv1 = ConvBNAct(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=1,
use_act=False,
use_lab=use_lab,
lr_mult=lr_mult)
self.conv2 = ConvBNAct(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=kernel_size,
groups=out_channels,
use_act=True,
use_lab=use_lab,
lr_mult=lr_mult)
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
return x
class StemBlock(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.stem1 = ConvBNAct(
in_channels=in_channels,
out_channels=mid_channels,
kernel_size=3,
stride=2,
use_lab=use_lab,
lr_mult=lr_mult)
self.stem2a = ConvBNAct(
in_channels=mid_channels,
out_channels=mid_channels // 2,
kernel_size=2,
stride=1,
padding="SAME",
use_lab=use_lab,
lr_mult=lr_mult)
self.stem2b = ConvBNAct(
in_channels=mid_channels // 2,
out_channels=mid_channels,
kernel_size=2,
stride=1,
padding="SAME",
use_lab=use_lab,
lr_mult=lr_mult)
self.stem3 = ConvBNAct(
in_channels=mid_channels * 2,
out_channels=mid_channels,
kernel_size=3,
stride=2,
use_lab=use_lab,
lr_mult=lr_mult)
self.stem4 = ConvBNAct(
in_channels=mid_channels,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
self.pool = nn.MaxPool2D(
kernel_size=2, stride=1, ceil_mode=True, padding="SAME")
def forward(self, x):
x = self.stem1(x)
x2 = self.stem2a(x)
x2 = self.stem2b(x2)
x1 = self.pool(x)
x = paddle.concat([x1, x2], 1)
x = self.stem3(x)
x = self.stem4(x)
return x
class HG_Block(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
kernel_size=3,
layer_num=6,
identity=False,
light_block=True,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.identity = identity
self.layers = nn.LayerList()
block_type = "LightConvBNAct" if light_block else "ConvBNAct"
for i in range(layer_num):
self.layers.append(
eval(block_type)(in_channels=in_channels
if i == 0 else mid_channels,
out_channels=mid_channels,
stride=1,
kernel_size=kernel_size,
use_lab=use_lab,
lr_mult=lr_mult))
# feature aggregation
total_channels = in_channels + layer_num * mid_channels
self.aggregation_squeeze_conv = ConvBNAct(
in_channels=total_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
self.aggregation_excitation_conv = ConvBNAct(
in_channels=out_channels // 2,
out_channels=out_channels,
kernel_size=1,
stride=1,
use_lab=use_lab,
lr_mult=lr_mult)
def forward(self, x):
identity = x
output = []
output.append(x)
for layer in self.layers:
x = layer(x)
output.append(x)
x = paddle.concat(output, axis=1)
x = self.aggregation_squeeze_conv(x)
x = self.aggregation_excitation_conv(x)
if self.identity:
x += identity
return x
class HG_Stage(nn.Layer):
def __init__(self,
in_channels,
mid_channels,
out_channels,
block_num,
layer_num=6,
downsample=True,
light_block=True,
kernel_size=3,
use_lab=False,
lr_mult=1.0):
super().__init__()
self.downsample = downsample
if downsample:
self.downsample = ConvBNAct(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=2,
groups=in_channels,
use_act=False,
use_lab=use_lab,
lr_mult=lr_mult)
blocks_list = []
for i in range(block_num):
blocks_list.append(
HG_Block(
in_channels=in_channels if i == 0 else out_channels,
mid_channels=mid_channels,
out_channels=out_channels,
kernel_size=kernel_size,
layer_num=layer_num,
identity=False if i == 0 else True,
light_block=light_block,
use_lab=use_lab,
lr_mult=lr_mult))
self.blocks = nn.Sequential(*blocks_list)
def forward(self, x):
if self.downsample:
x = self.downsample(x)
x = self.blocks(x)
return x
def _freeze_norm(m: nn.BatchNorm2D):
param_attr = ParamAttr(
learning_rate=0., regularizer=L2Decay(0.), trainable=False)
bias_attr = ParamAttr(
learning_rate=0., regularizer=L2Decay(0.), trainable=False)
global_stats = True
norm = nn.BatchNorm2D(
m._num_features,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
for param in norm.parameters():
param.stop_gradient = True
return norm
def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
if isinstance(model, nn.BatchNorm2D):
model = reset_func(model)
else:
for name, child in model.named_children():
_child = reset_bn(child, reset_func)
if _child is not child:
setattr(model, name, _child)
return model
@register
@serializable
class PPHGNetV2(nn.Layer):
"""
PPHGNetV2
Args:
stem_channels: list. Number of channels for the stem block.
stage_type: str. The stage configuration of PPHGNet. such as the number of channels, stride, etc.
use_lab: boolean. Whether to use LearnableAffineBlock in network.
lr_mult_list: list. Control the learning rate of different stages.
Returns:
model: nn.Layer. Specific PPHGNetV2 model depends on args.
"""
arch_configs = {
'L': {
'stem_channels': [3, 32, 48],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [48, 48, 128, 1, False, False, 3, 6],
"stage2": [128, 96, 512, 1, True, False, 3, 6],
"stage3": [512, 192, 1024, 3, True, True, 5, 6],
"stage4": [1024, 384, 2048, 1, True, True, 5, 6],
}
},
'X': {
'stem_channels': [3, 32, 64],
'stage_config': {
# in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
"stage1": [64, 64, 128, 1, False, False, 3, 6],
"stage2": [128, 128, 512, 2, True, False, 3, 6],
"stage3": [512, 256, 1024, 5, True, True, 5, 6],
"stage4": [1024, 512, 2048, 2, True, True, 5, 6],
}
}
}
def __init__(self,
arch,
use_lab=False,
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
return_idx=[1, 2, 3],
freeze_stem_only=True,
freeze_at=0,
freeze_norm=True):
super().__init__()
self.use_lab = use_lab
self.return_idx = return_idx
stem_channels = self.arch_configs[arch]['stem_channels']
stage_config = self.arch_configs[arch]['stage_config']
self._out_strides = [4, 8, 16, 32]
self._out_channels = [stage_config[k][2] for k in stage_config]
# stem
self.stem = StemBlock(
in_channels=stem_channels[0],
mid_channels=stem_channels[1],
out_channels=stem_channels[2],
use_lab=use_lab,
lr_mult=lr_mult_list[0])
# stages
self.stages = nn.LayerList()
for i, k in enumerate(stage_config):
in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[
k]
self.stages.append(
HG_Stage(
in_channels,
mid_channels,
out_channels,
block_num,
layer_num,
downsample,
light_block,
kernel_size,
use_lab,
lr_mult=lr_mult_list[i + 1]))
if freeze_at >= 0:
self._freeze_parameters(self.stem)
if not freeze_stem_only:
for i in range(min(freeze_at + 1, len(self.stages))):
self._freeze_parameters(self.stages[i])
if freeze_norm:
reset_bn(self, reset_func=_freeze_norm)
self._init_weights()
def _freeze_parameters(self, m):
for p in m.parameters():
p.stop_gradient = True
def _init_weights(self):
for m in self.sublayers():
if isinstance(m, nn.Conv2D):
kaiming_normal_(m.weight)
elif isinstance(m, (nn.BatchNorm2D)):
ones_(m.weight)
zeros_(m.bias)
elif isinstance(m, nn.Linear):
zeros_(m.bias)
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, inputs):
x = inputs['image']
x = self.stem(x)
outs = []
for idx, stage in enumerate(self.stages):
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs

View File

@@ -0,0 +1,271 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, Conv2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['LCNet']
NET_CONFIG = {
"blocks2":
#k, in_c, out_c, s, use_se
[[3, 16, 32, 1, False], ],
"blocks3": [
[3, 32, 64, 2, False],
[3, 64, 64, 1, False],
],
"blocks4": [
[3, 64, 128, 2, False],
[3, 128, 128, 1, False],
],
"blocks5": [
[3, 128, 256, 2, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
],
"blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
}
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvBNLayer(nn.Layer):
def __init__(self,
num_channels,
filter_size,
num_filters,
stride,
num_groups=1,
act='hard_swish'):
super().__init__()
self.conv = Conv2D(
in_channels=num_channels,
out_channels=num_filters,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=num_groups,
weight_attr=ParamAttr(initializer=KaimingNormal()),
bias_attr=False)
self.bn = nn.BatchNorm2D(
num_filters,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
if act == 'hard_swish':
self.act = nn.Hardswish()
elif act == 'relu6':
self.act = nn.ReLU6()
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
x = self.act(x)
return x
class DepthwiseSeparable(nn.Layer):
def __init__(self,
num_channels,
num_filters,
stride,
dw_size=3,
use_se=False,
act='hard_swish'):
super().__init__()
self.use_se = use_se
self.dw_conv = ConvBNLayer(
num_channels=num_channels,
num_filters=num_channels,
filter_size=dw_size,
stride=stride,
num_groups=num_channels,
act=act)
if use_se:
self.se = SEModule(num_channels)
self.pw_conv = ConvBNLayer(
num_channels=num_channels,
filter_size=1,
num_filters=num_filters,
stride=1,
act=act)
def forward(self, x):
x = self.dw_conv(x)
if self.use_se:
x = self.se(x)
x = self.pw_conv(x)
return x
class SEModule(nn.Layer):
def __init__(self, channel, reduction=4):
super().__init__()
self.avg_pool = AdaptiveAvgPool2D(1)
self.conv1 = Conv2D(
in_channels=channel,
out_channels=channel // reduction,
kernel_size=1,
stride=1,
padding=0)
self.relu = nn.ReLU()
self.conv2 = Conv2D(
in_channels=channel // reduction,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0)
self.hardsigmoid = nn.Hardsigmoid()
def forward(self, x):
identity = x
x = self.avg_pool(x)
x = self.conv1(x)
x = self.relu(x)
x = self.conv2(x)
x = self.hardsigmoid(x)
x = paddle.multiply(x=identity, y=x)
return x
@register
@serializable
class LCNet(nn.Layer):
def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
super().__init__()
self.scale = scale
self.feature_maps = feature_maps
out_channels = []
self.conv1 = ConvBNLayer(
num_channels=3,
filter_size=3,
num_filters=make_divisible(16 * scale),
stride=2,
act=act)
self.blocks2 = nn.Sequential(* [
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"])
])
self.blocks3 = nn.Sequential(* [
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks3"][-1][2] * scale))
self.blocks4 = nn.Sequential(* [
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks4"][-1][2] * scale))
self.blocks5 = nn.Sequential(* [
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks5"][-1][2] * scale))
self.blocks6 = nn.Sequential(* [
DepthwiseSeparable(
num_channels=make_divisible(in_c * scale),
num_filters=make_divisible(out_c * scale),
dw_size=k,
stride=s,
use_se=se,
act=act)
for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"])
])
out_channels.append(
make_divisible(NET_CONFIG["blocks6"][-1][2] * scale))
self._out_channels = [
ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps
]
def forward(self, inputs):
x = inputs['image']
outs = []
x = self.conv1(x)
x = self.blocks2(x)
x = self.blocks3(x)
outs.append(x)
x = self.blocks4(x)
outs.append(x)
x = self.blocks5(x)
outs.append(x)
x = self.blocks6(x)
outs.append(x)
outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps]
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,402 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNet']
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
num_groups=1,
act='relu',
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ConvBNLayer, self).__init__()
self.act = act
self._conv = nn.Conv2D(
in_channels,
out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=conv_lr,
initializer=KaimingNormal(),
regularizer=L2Decay(conv_decay)),
bias_attr=False)
param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
if norm_type in ['sync_bn', 'bn']:
self._batch_norm = nn.BatchNorm2D(
out_channels, weight_attr=param_attr, bias_attr=bias_attr)
def forward(self, x):
x = self._conv(x)
x = self._batch_norm(x)
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
return x
class DepthwiseSeparable(nn.Layer):
def __init__(self,
in_channels,
out_channels1,
out_channels2,
num_groups,
stride,
scale,
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(DepthwiseSeparable, self).__init__()
self._depthwise_conv = ConvBNLayer(
in_channels,
int(out_channels1 * scale),
kernel_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups * scale),
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_dw")
self._pointwise_conv = ConvBNLayer(
int(out_channels1 * scale),
int(out_channels2 * scale),
kernel_size=1,
stride=1,
padding=0,
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_sep")
def forward(self, x):
x = self._depthwise_conv(x)
x = self._pointwise_conv(x)
return x
class ExtraBlock(nn.Layer):
def __init__(self,
in_channels,
out_channels1,
out_channels2,
num_groups=1,
stride=2,
conv_lr=1.,
conv_decay=0.,
norm_decay=0.,
norm_type='bn',
name=None):
super(ExtraBlock, self).__init__()
self.pointwise_conv = ConvBNLayer(
in_channels,
int(out_channels1),
kernel_size=1,
stride=1,
padding=0,
num_groups=int(num_groups),
act='relu6',
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_extra1")
self.normal_conv = ConvBNLayer(
int(out_channels1),
int(out_channels2),
kernel_size=3,
stride=stride,
padding=1,
num_groups=int(num_groups),
act='relu6',
conv_lr=conv_lr,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name=name + "_extra2")
def forward(self, x):
x = self.pointwise_conv(x)
x = self.normal_conv(x)
return x
@register
@serializable
class MobileNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
norm_type='bn',
norm_decay=0.,
conv_decay=0.,
scale=1,
conv_learning_rate=1.0,
feature_maps=[4, 6, 13],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256],
[64, 128]]):
super(MobileNet, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
self._out_channels = []
self.conv1 = ConvBNLayer(
in_channels=3,
out_channels=int(32 * scale),
kernel_size=3,
stride=2,
padding=1,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv1")
self.dwsl = []
dws21 = self.add_sublayer(
"conv2_1",
sublayer=DepthwiseSeparable(
in_channels=int(32 * scale),
out_channels1=32,
out_channels2=64,
num_groups=32,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv2_1"))
self.dwsl.append(dws21)
self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps)
dws22 = self.add_sublayer(
"conv2_2",
sublayer=DepthwiseSeparable(
in_channels=int(64 * scale),
out_channels1=64,
out_channels2=128,
num_groups=64,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv2_2"))
self.dwsl.append(dws22)
self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
# 1/4
dws31 = self.add_sublayer(
"conv3_1",
sublayer=DepthwiseSeparable(
in_channels=int(128 * scale),
out_channels1=128,
out_channels2=128,
num_groups=128,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv3_1"))
self.dwsl.append(dws31)
self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps)
dws32 = self.add_sublayer(
"conv3_2",
sublayer=DepthwiseSeparable(
in_channels=int(128 * scale),
out_channels1=128,
out_channels2=256,
num_groups=128,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv3_2"))
self.dwsl.append(dws32)
self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
# 1/8
dws41 = self.add_sublayer(
"conv4_1",
sublayer=DepthwiseSeparable(
in_channels=int(256 * scale),
out_channels1=256,
out_channels2=256,
num_groups=256,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv4_1"))
self.dwsl.append(dws41)
self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps)
dws42 = self.add_sublayer(
"conv4_2",
sublayer=DepthwiseSeparable(
in_channels=int(256 * scale),
out_channels1=256,
out_channels2=512,
num_groups=256,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv4_2"))
self.dwsl.append(dws42)
self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
# 1/16
for i in range(5):
tmp = self.add_sublayer(
"conv5_" + str(i + 1),
sublayer=DepthwiseSeparable(
in_channels=int(512 * scale),
out_channels1=512,
out_channels2=512,
num_groups=512,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv5_" + str(i + 1)))
self.dwsl.append(tmp)
self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps)
dws56 = self.add_sublayer(
"conv5_6",
sublayer=DepthwiseSeparable(
in_channels=int(512 * scale),
out_channels1=512,
out_channels2=1024,
num_groups=512,
stride=2,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv5_6"))
self.dwsl.append(dws56)
self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
# 1/32
dws6 = self.add_sublayer(
"conv6",
sublayer=DepthwiseSeparable(
in_channels=int(1024 * scale),
out_channels1=1024,
out_channels2=1024,
num_groups=1024,
stride=1,
scale=scale,
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv6"))
self.dwsl.append(dws6)
self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps)
if self.with_extra_blocks:
self.extra_blocks = []
for i, block_filter in enumerate(self.extra_block_filters):
in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1]
conv_extra = self.add_sublayer(
"conv7_" + str(i + 1),
sublayer=ExtraBlock(
in_c,
block_filter[0],
block_filter[1],
conv_lr=conv_learning_rate,
conv_decay=conv_decay,
norm_decay=norm_decay,
norm_type=norm_type,
name="conv7_" + str(i + 1)))
self.extra_blocks.append(conv_extra)
self._update_out_channels(
block_filter[1],
len(self.dwsl) + len(self.extra_blocks), feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
outs = []
y = self.conv1(inputs['image'])
for i, block in enumerate(self.dwsl):
y = block(y)
if i + 1 in self.feature_maps:
outs.append(y)
if not self.with_extra_blocks:
return outs
y = outs[-1]
for i, block in enumerate(self.extra_blocks):
idx = i + len(self.dwsl)
y = block(y)
if idx + 1 in self.feature_maps:
outs.append(y)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,478 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNetV3']
def make_divisible(v, divisor=8, min_value=None):
if min_value is None:
min_value = divisor
new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
if new_v < 0.9 * v:
new_v += divisor
return new_v
class ConvBNLayer(nn.Layer):
def __init__(self,
in_c,
out_c,
filter_size,
stride,
padding,
num_groups=1,
act=None,
lr_mult=1.,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=""):
super(ConvBNLayer, self).__init__()
self.act = act
self.conv = nn.Conv2D(
in_channels=in_c,
out_channels=out_c,
kernel_size=filter_size,
stride=stride,
padding=padding,
groups=num_groups,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=False)
norm_lr = 0. if freeze_norm else lr_mult
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
global_stats = True if freeze_norm else None
if norm_type in ['sync_bn', 'bn']:
self.bn = nn.BatchNorm2D(
out_c,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
norm_params = self.bn.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
def forward(self, x):
x = self.conv(x)
x = self.bn(x)
if self.act is not None:
if self.act == "relu":
x = F.relu(x)
elif self.act == "relu6":
x = F.relu6(x)
elif self.act == "hard_swish":
x = F.hardswish(x)
else:
raise NotImplementedError(
"The activation function is selected incorrectly.")
return x
class ResidualUnit(nn.Layer):
def __init__(self,
in_c,
mid_c,
out_c,
filter_size,
stride,
use_se,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
act=None,
return_list=False,
name=''):
super(ResidualUnit, self).__init__()
self.if_shortcut = stride == 1 and in_c == out_c
self.use_se = use_se
self.return_list = return_list
self.expand_conv = ConvBNLayer(
in_c=in_c,
out_c=mid_c,
filter_size=1,
stride=1,
padding=0,
act=act,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_expand")
self.bottleneck_conv = ConvBNLayer(
in_c=mid_c,
out_c=mid_c,
filter_size=filter_size,
stride=stride,
padding=int((filter_size - 1) // 2),
num_groups=mid_c,
act=act,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_depthwise")
if self.use_se:
self.mid_se = SEModule(
mid_c, lr_mult, conv_decay, name=name + "_se")
self.linear_conv = ConvBNLayer(
in_c=mid_c,
out_c=out_c,
filter_size=1,
stride=1,
padding=0,
act=None,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_linear")
def forward(self, inputs):
y = self.expand_conv(inputs)
x = self.bottleneck_conv(y)
if self.use_se:
x = self.mid_se(x)
x = self.linear_conv(x)
if self.if_shortcut:
x = paddle.add(inputs, x)
if self.return_list:
return [y, x]
else:
return x
class SEModule(nn.Layer):
def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
super(SEModule, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2D(1)
mid_channels = int(channel // reduction)
self.conv1 = nn.Conv2D(
in_channels=channel,
out_channels=mid_channels,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
self.conv2 = nn.Conv2D(
in_channels=mid_channels,
out_channels=channel,
kernel_size=1,
stride=1,
padding=0,
weight_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)),
bias_attr=ParamAttr(
learning_rate=lr_mult, regularizer=L2Decay(conv_decay)))
def forward(self, inputs):
outputs = self.avg_pool(inputs)
outputs = self.conv1(outputs)
outputs = F.relu(outputs)
outputs = self.conv2(outputs)
outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5)
return paddle.multiply(x=inputs, y=outputs)
class ExtraBlockDW(nn.Layer):
def __init__(self,
in_c,
ch_1,
ch_2,
stride,
lr_mult,
conv_decay=0.,
norm_type='bn',
norm_decay=0.,
freeze_norm=False,
name=None):
super(ExtraBlockDW, self).__init__()
self.pointwise_conv = ConvBNLayer(
in_c=in_c,
out_c=ch_1,
filter_size=1,
stride=1,
padding='SAME',
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra1")
self.depthwise_conv = ConvBNLayer(
in_c=ch_1,
out_c=ch_2,
filter_size=3,
stride=stride,
padding='SAME',
num_groups=int(ch_1),
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_dw")
self.normal_conv = ConvBNLayer(
in_c=ch_2,
out_c=ch_2,
filter_size=1,
stride=1,
padding='SAME',
act='relu6',
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name=name + "_extra2_sep")
def forward(self, inputs):
x = self.pointwise_conv(inputs)
x = self.depthwise_conv(x)
x = self.normal_conv(x)
return x
@register
@serializable
class MobileNetV3(nn.Layer):
__shared__ = ['norm_type']
def __init__(
self,
scale=1.0,
model_name="large",
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.0,
multiplier=1.0,
norm_type='bn',
norm_decay=0.0,
freeze_norm=False):
super(MobileNetV3, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
if norm_type == 'sync_bn' and freeze_norm:
raise ValueError(
"The norm_type should not be sync_bn when freeze_norm is True")
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, "relu", 1],
[3, 64, 24, False, "relu", 2],
[3, 72, 24, False, "relu", 1],
[5, 72, 40, True, "relu", 2], # RCNN output
[5, 120, 40, True, "relu", 1],
[5, 120, 40, True, "relu", 1], # YOLOv3 output
[3, 240, 80, False, "hard_swish", 2], # RCNN output
[3, 200, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 480, 112, True, "hard_swish", 1],
[3, 672, 112, True, "hard_swish", 1], # YOLOv3 output
[5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 960, 160, True, "hard_swish", 1],
[5, 960, 160, True, "hard_swish", 1], # YOLOv3 output
]
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, "relu", 2],
[3, 72, 24, False, "relu", 2], # RCNN output
[3, 88, 24, False, "relu", 1], # YOLOv3 output
[5, 96, 40, True, "hard_swish", 2], # RCNN output
[5, 240, 40, True, "hard_swish", 1],
[5, 240, 40, True, "hard_swish", 1],
[5, 120, 48, True, "hard_swish", 1],
[5, 144, 48, True, "hard_swish", 1], # YOLOv3 output
[5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 576, 96, True, "hard_swish", 1],
[5, 576, 96, True, "hard_swish", 1], # YOLOv3 output
]
else:
raise NotImplementedError(
"mode[{}_model] is not implemented!".format(model_name))
if multiplier != 1.0:
self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
self.conv1 = ConvBNLayer(
in_c=3,
out_c=make_divisible(inplanes * scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult_list[0],
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv1")
self._out_channels = []
self.block_list = []
i = 0
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in self.cfg:
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
# for SSD/SSDLite, first head input is after ResidualUnit expand_conv
return_list = self.with_extra_blocks and i + 2 in self.feature_maps
block = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ResidualUnit(
in_c=inplanes,
mid_c=make_divisible(scale * exp),
out_c=make_divisible(scale * c),
filter_size=k,
stride=s,
use_se=se,
act=nl,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
return_list=return_list,
name="conv" + str(i + 2)))
self.block_list.append(block)
inplanes = make_divisible(scale * c)
i += 1
self._update_out_channels(
make_divisible(scale * exp)
if return_list else inplanes, i + 1, feature_maps)
if self.with_extra_blocks:
self.extra_block_list = []
extra_out_c = make_divisible(scale * self.cfg[-1][1])
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ConvBNLayer(
in_c=inplanes,
out_c=extra_out_c,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv" + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(extra_out_c, i + 1, feature_maps)
for j, block_filter in enumerate(self.extra_block_filters):
in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
1][1]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ExtraBlockDW(
in_c,
block_filter[0],
block_filter[1],
stride=2,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name='conv' + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(block_filter[1], i + 1, feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
x = self.conv1(inputs['image'])
outs = []
for idx, block in enumerate(self.block_list):
x = block(x)
if idx + 2 in self.feature_maps:
if isinstance(x, list):
outs.append(x[0])
x = x[1]
else:
outs.append(x)
if not self.with_extra_blocks:
return outs
for i, block in enumerate(self.extra_block_list):
idx = i + len(self.block_list)
x = block(x)
if idx + 2 in self.feature_maps:
outs.append(x)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,266 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
Ths copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.modeling.layers import ConvNormLayer
class MobileOneBlock(nn.Layer):
def __init__(
self,
ch_in,
ch_out,
stride,
kernel_size,
conv_num=1,
norm_type='bn',
norm_decay=0.,
norm_groups=32,
bias_on=False,
lr_scale=1.,
freeze_norm=False,
initializer=Normal(
mean=0., std=0.01),
skip_quant=False,
act='relu', ):
super(MobileOneBlock, self).__init__()
self.ch_in = ch_in
self.ch_out = ch_out
self.kernel_size = kernel_size
self.stride = stride
self.padding = (kernel_size - 1) // 2
self.k = conv_num
self.depth_conv = nn.LayerList()
self.point_conv = nn.LayerList()
for _ in range(self.k):
self.depth_conv.append(
ConvNormLayer(
ch_in,
ch_in,
kernel_size,
stride=stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.point_conv.append(
ConvNormLayer(
ch_in,
ch_out,
1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant))
self.rbr_1x1 = ConvNormLayer(
ch_in,
ch_in,
1,
stride=self.stride,
groups=ch_in,
norm_type=norm_type,
norm_decay=norm_decay,
norm_groups=norm_groups,
bias_on=bias_on,
lr_scale=lr_scale,
freeze_norm=freeze_norm,
initializer=initializer,
skip_quant=skip_quant)
self.rbr_identity_st1 = nn.BatchNorm2D(
num_features=ch_in,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.rbr_identity_st2 = nn.BatchNorm2D(
num_features=ch_out,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(
0.0))) if ch_in == ch_out and self.stride == 1 else None
self.act = get_act_fn(act) if act is None or isinstance(act, (
str, dict)) else act
def forward(self, x):
if hasattr(self, "conv1") and hasattr(self, "conv2"):
y = self.act(self.conv2(self.act(self.conv1(x))))
else:
if self.rbr_identity_st1 is None:
id_out_st1 = 0
else:
id_out_st1 = self.rbr_identity_st1(x)
x1_1 = 0
for i in range(self.k):
x1_1 += self.depth_conv[i](x)
x1_2 = self.rbr_1x1(x)
x1 = self.act(x1_1 + x1_2 + id_out_st1)
if self.rbr_identity_st2 is None:
id_out_st2 = 0
else:
id_out_st2 = self.rbr_identity_st2(x1)
x2_1 = 0
for i in range(self.k):
x2_1 += self.point_conv[i](x1)
y = self.act(x2_1 + id_out_st2)
return y
def convert_to_deploy(self):
if not hasattr(self, 'conv1'):
self.conv1 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_in,
kernel_size=self.kernel_size,
stride=self.stride,
padding=self.padding,
groups=self.ch_in,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
if not hasattr(self, 'conv2'):
self.conv2 = nn.Conv2D(
in_channels=self.ch_in,
out_channels=self.ch_out,
kernel_size=1,
stride=1,
padding='SAME',
groups=1,
bias_attr=ParamAttr(
initializer=Constant(value=0.), learning_rate=1.))
conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
)
self.conv1.weight.set_value(conv1_kernel)
self.conv1.bias.set_value(conv1_bias)
self.conv2.weight.set_value(conv2_kernel)
self.conv2.bias.set_value(conv2_bias)
self.__delattr__('depth_conv')
self.__delattr__('point_conv')
self.__delattr__('rbr_1x1')
if hasattr(self, 'rbr_identity_st1'):
self.__delattr__('rbr_identity_st1')
if hasattr(self, 'rbr_identity_st2'):
self.__delattr__('rbr_identity_st2')
def get_equivalent_kernel_bias(self):
st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
st1_kernelid, st1_biasid = self._fuse_bn_tensor(
self.rbr_identity_st1, kernel_size=self.kernel_size)
st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
st2_kernelid, st2_biasid = self._fuse_bn_tensor(
self.rbr_identity_st2, kernel_size=1)
conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
st1_kernel1x1) + st1_kernelid
conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
conv2_kernel = st2_kernel1x1 + st2_kernelid
conv2_bias = st2_bias1x1 + st2_biasid
return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias
def _pad_1x1_to_3x3_tensor(self, kernel1x1):
if kernel1x1 is None:
return 0
else:
padding_size = (self.kernel_size - 1) // 2
return nn.functional.pad(
kernel1x1,
[padding_size, padding_size, padding_size, padding_size])
def _fuse_bn_tensor(self, branch, kernel_size=3):
if branch is None:
return 0, 0
if isinstance(branch, nn.LayerList):
fused_kernels = []
fused_bias = []
for block in branch:
kernel = block.conv.weight
running_mean = block.norm._mean
running_var = block.norm._variance
gamma = block.norm.weight
beta = block.norm.bias
eps = block.norm._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
fused_kernels.append(kernel * t)
fused_bias.append(beta - running_mean * gamma / std)
return sum(fused_kernels), sum(fused_bias)
elif isinstance(branch, ConvNormLayer):
kernel = branch.conv.weight
running_mean = branch.norm._mean
running_var = branch.norm._variance
gamma = branch.norm.weight
beta = branch.norm.bias
eps = branch.norm._epsilon
else:
assert isinstance(branch, nn.BatchNorm2D)
input_dim = self.ch_in if kernel_size == 1 else 1
kernel_value = paddle.zeros(
shape=[self.ch_in, input_dim, kernel_size, kernel_size],
dtype='float32')
if kernel_size > 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
kernel_size - 1) // 2] = 1
elif kernel_size == 1:
for i in range(self.ch_in):
kernel_value[i, i % input_dim, 0, 0] = 1
else:
raise ValueError("Invalid kernel size recieved!")
kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
running_mean = branch._mean
running_var = branch._variance
gamma = branch.weight
beta = branch.bias
eps = branch._epsilon
std = (running_var + eps).sqrt()
t = (gamma / std).reshape((-1, 1, 1, 1))
return kernel * t, beta - running_mean * gamma / std

View File

@@ -0,0 +1,69 @@
class NameAdapter(object):
"""Fix the backbones variable names for pretrained weight"""
def __init__(self, model):
super(NameAdapter, self).__init__()
self.model = model
@property
def model_type(self):
return getattr(self.model, '_model_type', '')
@property
def variant(self):
return getattr(self.model, 'variant', '')
def fix_conv_norm_name(self, name):
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
# the naming rule is same as pretrained weight
if self.model_type == 'SEResNeXt':
bn_name = name + "_bn"
return bn_name
def fix_shortcut_name(self, name):
if self.model_type == 'SEResNeXt':
name = 'conv' + name + '_prj'
return name
def fix_bottleneck_name(self, name):
if self.model_type == 'SEResNeXt':
conv_name1 = 'conv' + name + '_x1'
conv_name2 = 'conv' + name + '_x2'
conv_name3 = 'conv' + name + '_x3'
shortcut_name = name
else:
conv_name1 = name + "_branch2a"
conv_name2 = name + "_branch2b"
conv_name3 = name + "_branch2c"
shortcut_name = name + "_branch1"
return conv_name1, conv_name2, conv_name3, shortcut_name
def fix_basicblock_name(self, name):
if self.model_type == 'SEResNeXt':
conv_name1 = 'conv' + name + '_x1'
conv_name2 = 'conv' + name + '_x2'
shortcut_name = name
else:
conv_name1 = name + "_branch2a"
conv_name2 = name + "_branch2b"
shortcut_name = name + "_branch1"
return conv_name1, conv_name2, shortcut_name
def fix_layer_warp_name(self, stage_num, count, i):
name = 'res' + str(stage_num)
if count > 10 and stage_num == 4:
if i == 0:
conv_name = name + "a"
else:
conv_name = name + "b" + str(i)
else:
conv_name = name + chr(ord("a") + i)
if self.model_type == 'SEResNeXt':
conv_name = str(stage_num + 2) + '_' + str(i + 1)
return conv_name
def fix_c1_stage_name(self):
return "res_conv1" if self.model_type == 'ResNeXt' else "conv1"

View File

@@ -0,0 +1,611 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from numbers import Integral
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Uniform
from paddle import ParamAttr
from paddle.nn.initializer import Constant
from paddle.vision.ops import DeformConv2D
from .name_adapter import NameAdapter
from ..shape_spec import ShapeSpec
__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
ResNet_cfg = {
18: [2, 2, 2, 2],
34: [3, 4, 6, 3],
50: [3, 4, 6, 3],
101: [3, 4, 23, 3],
152: [3, 8, 36, 3],
}
class ConvNormLayer(nn.Layer):
def __init__(self,
ch_in,
ch_out,
filter_size,
stride,
groups=1,
act=None,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
lr=1.0,
dcn_v2=False):
super(ConvNormLayer, self).__init__()
assert norm_type in ['bn', 'sync_bn']
self.norm_type = norm_type
self.act = act
self.dcn_v2 = dcn_v2
if not self.dcn_v2:
self.conv = nn.Conv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr),
bias_attr=False)
else:
self.offset_channel = 2 * filter_size**2
self.mask_channel = filter_size**2
self.conv_offset = nn.Conv2D(
in_channels=ch_in,
out_channels=3 * filter_size**2,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
weight_attr=ParamAttr(initializer=Constant(0.)),
bias_attr=ParamAttr(initializer=Constant(0.)))
self.conv = DeformConv2D(
in_channels=ch_in,
out_channels=ch_out,
kernel_size=filter_size,
stride=stride,
padding=(filter_size - 1) // 2,
dilation=1,
groups=groups,
weight_attr=ParamAttr(learning_rate=lr),
bias_attr=False)
norm_lr = 0. if freeze_norm else lr
param_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
bias_attr = ParamAttr(
learning_rate=norm_lr,
regularizer=L2Decay(norm_decay),
trainable=False if freeze_norm else True)
global_stats = True if freeze_norm else None
if norm_type in ['sync_bn', 'bn']:
self.norm = nn.BatchNorm2D(
ch_out,
weight_attr=param_attr,
bias_attr=bias_attr,
use_global_stats=global_stats)
norm_params = self.norm.parameters()
if freeze_norm:
for param in norm_params:
param.stop_gradient = True
def forward(self, inputs):
if not self.dcn_v2:
out = self.conv(inputs)
else:
offset_mask = self.conv_offset(inputs)
offset, mask = paddle.split(
offset_mask,
num_or_sections=[self.offset_channel, self.mask_channel],
axis=1)
mask = F.sigmoid(mask)
out = self.conv(inputs, offset, mask=mask)
if self.norm_type in ['bn', 'sync_bn']:
out = self.norm(out)
if self.act:
out = getattr(F, self.act)(out)
return out
class SELayer(nn.Layer):
def __init__(self, ch, reduction_ratio=16):
super(SELayer, self).__init__()
self.pool = nn.AdaptiveAvgPool2D(1)
stdv = 1.0 / math.sqrt(ch)
c_ = ch // reduction_ratio
self.squeeze = nn.Linear(
ch,
c_,
weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=True)
stdv = 1.0 / math.sqrt(c_)
self.extract = nn.Linear(
c_,
ch,
weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)),
bias_attr=True)
def forward(self, inputs):
out = self.pool(inputs)
out = paddle.squeeze(out, axis=[2, 3])
out = self.squeeze(out)
out = F.relu(out)
out = self.extract(out)
out = F.sigmoid(out)
out = paddle.unsqueeze(out, axis=[2, 3])
scale = out * inputs
return scale
class BasicBlock(nn.Layer):
expansion = 1
def __init__(self,
ch_in,
ch_out,
stride,
shortcut,
variant='b',
groups=1,
base_width=64,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(BasicBlock, self).__init__()
assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'
self.shortcut = shortcut
if not shortcut:
if variant == 'd' and stride == 2:
self.short = nn.Sequential()
self.short.add_sublayer(
'pool',
nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True))
self.short.add_sublayer(
'conv',
ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr))
else:
self.short = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=1,
stride=stride,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2a = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out,
filter_size=3,
stride=stride,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2b = ConvNormLayer(
ch_in=ch_out,
ch_out=ch_out,
filter_size=3,
stride=1,
act=None,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr,
dcn_v2=dcn_v2)
self.std_senet = std_senet
if self.std_senet:
self.se = SELayer(ch_out)
def forward(self, inputs):
out = self.branch2a(inputs)
out = self.branch2b(out)
if self.std_senet:
out = self.se(out)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
out = paddle.add(x=out, y=short)
out = F.relu(out)
return out
class BottleNeck(nn.Layer):
expansion = 4
def __init__(self,
ch_in,
ch_out,
stride,
shortcut,
variant='b',
groups=1,
base_width=4,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(BottleNeck, self).__init__()
if variant == 'a':
stride1, stride2 = stride, 1
else:
stride1, stride2 = 1, stride
# ResNeXt
width = int(ch_out * (base_width / 64.)) * groups
self.branch2a = ConvNormLayer(
ch_in=ch_in,
ch_out=width,
filter_size=1,
stride=stride1,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.branch2b = ConvNormLayer(
ch_in=width,
ch_out=width,
filter_size=3,
stride=stride2,
groups=groups,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr,
dcn_v2=dcn_v2)
self.branch2c = ConvNormLayer(
ch_in=width,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=1,
groups=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.shortcut = shortcut
if not shortcut:
if variant == 'd' and stride == 2:
self.short = nn.Sequential()
self.short.add_sublayer(
'pool',
nn.AvgPool2D(
kernel_size=2, stride=2, padding=0, ceil_mode=True))
self.short.add_sublayer(
'conv',
ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=1,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr))
else:
self.short = ConvNormLayer(
ch_in=ch_in,
ch_out=ch_out * self.expansion,
filter_size=1,
stride=stride,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=lr)
self.std_senet = std_senet
if self.std_senet:
self.se = SELayer(ch_out * self.expansion)
def forward(self, inputs):
out = self.branch2a(inputs)
out = self.branch2b(out)
out = self.branch2c(out)
if self.std_senet:
out = self.se(out)
if self.shortcut:
short = inputs
else:
short = self.short(inputs)
out = paddle.add(x=out, y=short)
out = F.relu(out)
return out
class Blocks(nn.Layer):
def __init__(self,
block,
ch_in,
ch_out,
count,
name_adapter,
stage_num,
variant='b',
groups=1,
base_width=64,
lr=1.0,
norm_type='bn',
norm_decay=0.,
freeze_norm=True,
dcn_v2=False,
std_senet=False):
super(Blocks, self).__init__()
self.blocks = []
for i in range(count):
conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
layer = self.add_sublayer(
conv_name,
block(
ch_in=ch_in,
ch_out=ch_out,
stride=2 if i == 0 and stage_num != 2 else 1,
shortcut=False if i == 0 else True,
variant=variant,
groups=groups,
base_width=base_width,
lr=lr,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=dcn_v2,
std_senet=std_senet))
self.blocks.append(layer)
if i == 0:
ch_in = ch_out * block.expansion
def forward(self, inputs):
block_out = inputs
for block in self.blocks:
block_out = block(block_out)
return block_out
@register
@serializable
class ResNet(nn.Layer):
__shared__ = ['norm_type']
def __init__(self,
depth=50,
ch_in=64,
variant='b',
lr_mult_list=[1.0, 1.0, 1.0, 1.0],
groups=1,
base_width=64,
norm_type='bn',
norm_decay=0,
freeze_norm=True,
freeze_at=0,
return_idx=[0, 1, 2, 3],
dcn_v2_stages=[-1],
num_stages=4,
std_senet=False,
freeze_stem_only=False):
"""
Residual Network, see https://arxiv.org/abs/1512.03385
Args:
depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
ch_in (int): output channel of first stage, default 64
variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
lower learning rate ratio is need for pretrained model
got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
groups (int): group convolution cardinality
base_width (int): base width of each group convolution
norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
norm_decay (float): weight decay for normalization layer weights
freeze_norm (bool): freeze normalization layers
freeze_at (int): freeze the backbone at which stage
return_idx (list): index of the stages whose feature maps are returned
dcn_v2_stages (list): index of stages who select deformable conv v2
num_stages (int): total num of stages
std_senet (bool): whether use senet, default False.
"""
super(ResNet, self).__init__()
self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
assert num_stages >= 1 and num_stages <= 4
self.depth = depth
self.variant = variant
self.groups = groups
self.base_width = base_width
self.norm_type = norm_type
self.norm_decay = norm_decay
self.freeze_norm = freeze_norm
self.freeze_at = freeze_at
if isinstance(return_idx, Integral):
return_idx = [return_idx]
assert max(return_idx) < num_stages, \
'the maximum return index must smaller than num_stages, ' \
'but received maximum return index is {} and num_stages ' \
'is {}'.format(max(return_idx), num_stages)
self.return_idx = return_idx
self.num_stages = num_stages
assert len(lr_mult_list) == 4, \
"lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
if isinstance(dcn_v2_stages, Integral):
dcn_v2_stages = [dcn_v2_stages]
assert max(dcn_v2_stages) < num_stages
if isinstance(dcn_v2_stages, Integral):
dcn_v2_stages = [dcn_v2_stages]
assert max(dcn_v2_stages) < num_stages
self.dcn_v2_stages = dcn_v2_stages
block_nums = ResNet_cfg[depth]
na = NameAdapter(self)
conv1_name = na.fix_c1_stage_name()
if variant in ['c', 'd']:
conv_def = [
[3, ch_in // 2, 3, 2, "conv1_1"],
[ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
[ch_in // 2, ch_in, 3, 1, "conv1_3"],
]
else:
conv_def = [[3, ch_in, 7, 2, conv1_name]]
self.conv1 = nn.Sequential()
for (c_in, c_out, k, s, _name) in conv_def:
self.conv1.add_sublayer(
_name,
ConvNormLayer(
ch_in=c_in,
ch_out=c_out,
filter_size=k,
stride=s,
groups=1,
act='relu',
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
lr=1.0))
self.ch_in = ch_in
ch_out_list = [64, 128, 256, 512]
block = BottleNeck if depth >= 50 else BasicBlock
self._out_channels = [block.expansion * v for v in ch_out_list]
self._out_strides = [4, 8, 16, 32]
self.res_layers = []
for i in range(num_stages):
lr_mult = lr_mult_list[i]
stage_num = i + 2
res_name = "res{}".format(stage_num)
res_layer = self.add_sublayer(
res_name,
Blocks(
block,
self.ch_in,
ch_out_list[i],
count=block_nums[i],
name_adapter=na,
stage_num=stage_num,
variant=variant,
groups=groups,
base_width=base_width,
lr=lr_mult,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
dcn_v2=(i in self.dcn_v2_stages),
std_senet=std_senet))
self.res_layers.append(res_layer)
self.ch_in = self._out_channels[i]
if freeze_at >= 0:
self._freeze_parameters(self.conv1)
if not freeze_stem_only:
for i in range(min(freeze_at + 1, num_stages)):
self._freeze_parameters(self.res_layers[i])
def _freeze_parameters(self, m):
for p in m.parameters():
p.stop_gradient = True
@property
def out_shape(self):
return [
ShapeSpec(
channels=self._out_channels[i], stride=self._out_strides[i])
for i in self.return_idx
]
def forward(self, inputs):
x = inputs['image']
conv1 = self.conv1(x)
x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
outs = []
for idx, stage in enumerate(self.res_layers):
x = stage(x)
if idx in self.return_idx:
outs.append(x)
return outs
@register
class Res5Head(nn.Layer):
def __init__(self, depth=50):
super(Res5Head, self).__init__()
feat_in, feat_out = [1024, 512]
if depth < 50:
feat_in = 256
na = NameAdapter(self)
block = BottleNeck if depth >= 50 else BasicBlock
self.res5 = Blocks(
block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
self.feat_out = feat_out if depth < 50 else feat_out * 4
@property
def out_shape(self):
return [ShapeSpec(
channels=self.feat_out,
stride=16, )]
def forward(self, roi_feat, stage=0):
y = self.res5(roi_feat)
return y

View File

@@ -0,0 +1,250 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
from ppdet.modeling.ops import channel_shuffle
__all__ = ['ShuffleNetV2']
class ConvBNLayer(nn.Layer):
def __init__(self,
in_channels,
out_channels,
kernel_size,
stride,
padding,
groups=1,
act=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
in_channels=in_channels,
out_channels=out_channels,
kernel_size=kernel_size,
stride=stride,
padding=padding,
groups=groups,
weight_attr=ParamAttr(initializer=KaimingNormal()),
bias_attr=False)
self._batch_norm = BatchNorm2D(
out_channels,
weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
if act == "hard_swish":
act = 'hardswish'
self.act = act
def forward(self, inputs):
y = self._conv(inputs)
y = self._batch_norm(y)
if self.act:
y = getattr(F, self.act)(y)
return y
class InvertedResidual(nn.Layer):
def __init__(self, in_channels, out_channels, stride, act="relu"):
super(InvertedResidual, self).__init__()
self._conv_pw = ConvBNLayer(
in_channels=in_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=out_channels // 2,
act=None)
self._conv_linear = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
def forward(self, inputs):
x1, x2 = paddle.split(
inputs,
num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2],
axis=1)
x2 = self._conv_pw(x2)
x2 = self._conv_dw(x2)
x2 = self._conv_linear(x2)
out = paddle.concat([x1, x2], axis=1)
return channel_shuffle(out, 2)
class InvertedResidualDS(nn.Layer):
def __init__(self, in_channels, out_channels, stride, act="relu"):
super(InvertedResidualDS, self).__init__()
# branch1
self._conv_dw_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=in_channels,
kernel_size=3,
stride=stride,
padding=1,
groups=in_channels,
act=None)
self._conv_linear_1 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
# branch2
self._conv_pw_2 = ConvBNLayer(
in_channels=in_channels,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
self._conv_dw_2 = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=3,
stride=stride,
padding=1,
groups=out_channels // 2,
act=None)
self._conv_linear_2 = ConvBNLayer(
in_channels=out_channels // 2,
out_channels=out_channels // 2,
kernel_size=1,
stride=1,
padding=0,
groups=1,
act=act)
def forward(self, inputs):
x1 = self._conv_dw_1(inputs)
x1 = self._conv_linear_1(x1)
x2 = self._conv_pw_2(inputs)
x2 = self._conv_dw_2(x2)
x2 = self._conv_linear_2(x2)
out = paddle.concat([x1, x2], axis=1)
return channel_shuffle(out, 2)
@register
@serializable
class ShuffleNetV2(nn.Layer):
def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
super(ShuffleNetV2, self).__init__()
self.scale = scale
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
self.feature_maps = feature_maps
stage_repeats = [4, 8, 4]
if scale == 0.25:
stage_out_channels = [-1, 24, 24, 48, 96, 512]
elif scale == 0.33:
stage_out_channels = [-1, 24, 32, 64, 128, 512]
elif scale == 0.5:
stage_out_channels = [-1, 24, 48, 96, 192, 1024]
elif scale == 1.0:
stage_out_channels = [-1, 24, 116, 232, 464, 1024]
elif scale == 1.5:
stage_out_channels = [-1, 24, 176, 352, 704, 1024]
elif scale == 2.0:
stage_out_channels = [-1, 24, 244, 488, 976, 2048]
else:
raise NotImplementedError("This scale size:[" + str(scale) +
"] is not implemented!")
self._out_channels = []
self._feature_idx = 0
# 1. conv1
self._conv1 = ConvBNLayer(
in_channels=3,
out_channels=stage_out_channels[1],
kernel_size=3,
stride=2,
padding=1,
act=act)
self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
self._feature_idx += 1
# 2. bottleneck sequences
self._block_list = []
for stage_id, num_repeat in enumerate(stage_repeats):
for i in range(num_repeat):
if i == 0:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidualDS(
in_channels=stage_out_channels[stage_id + 1],
out_channels=stage_out_channels[stage_id + 2],
stride=2,
act=act))
else:
block = self.add_sublayer(
name=str(stage_id + 2) + '_' + str(i + 1),
sublayer=InvertedResidual(
in_channels=stage_out_channels[stage_id + 2],
out_channels=stage_out_channels[stage_id + 2],
stride=1,
act=act))
self._block_list.append(block)
self._feature_idx += 1
self._update_out_channels(stage_out_channels[stage_id + 2],
self._feature_idx, self.feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
y = self._conv1(inputs['image'])
y = self._max_pool(y)
outs = []
for i, inv in enumerate(self._block_list):
y = inv(y)
if i + 2 in self.feature_maps:
outs.append(y)
return outs
@property
def out_shape(self):
return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,752 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
Ths copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
__all__ = ['SwinTransformer']
MODEL_cfg = {
# use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config
'swin_T_224': dict(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_S_224': dict(
pretrain_img_size=224,
embed_dim=96,
depths=[2, 2, 18, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_B_224': dict(
pretrain_img_size=224,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_L_224': dict(
pretrain_img_size=224,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=7,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
),
'swin_B_384': dict(
pretrain_img_size=384,
embed_dim=128,
depths=[2, 2, 18, 2],
num_heads=[4, 8, 16, 32],
window_size=12,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
),
'swin_L_384': dict(
pretrain_img_size=384,
embed_dim=192,
depths=[2, 2, 18, 2],
num_heads=[6, 12, 24, 48],
window_size=12,
pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
),
}
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
def window_partition(x, window_size):
"""
Args:
x: (B, H, W, C)
window_size (int): window size
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
x = x.reshape(
[-1, H // window_size, window_size, W // window_size, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
[-1, window_size, window_size, C])
return windows
def window_reverse(windows, window_size, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
window_size (int): Window size
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
_, _, _, C = windows.shape
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.reshape(
[-1, H // window_size, W // window_size, window_size, window_size, C])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
return x
class WindowAttention(nn.Layer):
""" Window based multi-head self attention (W-MSA) module with relative position bias.
It supports both of shifted and non-shifted window.
Args:
dim (int): Number of input channels.
window_size (tuple[int]): The height and width of the window.
num_heads (int): Number of attention heads.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
proj_drop (float, optional): Dropout ratio of output. Default: 0.0
"""
def __init__(self,
dim,
window_size,
num_heads,
qkv_bias=True,
qk_scale=None,
attn_drop=0.,
proj_drop=0.):
super().__init__()
self.dim = dim
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
# define a parameter table of relative position bias
self.relative_position_bias_table = add_parameter(
self,
paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads))) # 2*Wh-1 * 2*Ww-1, nH
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(self.window_size[0])
coords_w = paddle.arange(self.window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
relative_coords = coords_flatten_1 - coords_flatten_2
relative_coords = relative_coords.transpose(
[1, 2, 0]) # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += self.window_size[
0] - 1 # shift to start from 0
relative_coords[:, :, 1] += self.window_size[1] - 1
relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
trunc_normal_(self.relative_position_bias_table)
self.softmax = nn.Softmax(axis=-1)
def forward(self, x, mask=None):
""" Forward function.
Args:
x: input features with shape of (num_windows*B, N, C)
mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
"""
B_, N, C = x.shape
qkv = self.qkv(x).reshape(
[-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
[2, 0, 3, 1, 4])
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))
index = self.relative_position_index.flatten()
relative_position_bias = paddle.index_select(
self.relative_position_bias_table, index)
relative_position_bias = relative_position_bias.reshape([
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1], -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
[2, 0, 1]) # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.reshape([-1, nW, self.num_heads, N, N
]) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.reshape([-1, self.num_heads, N, N])
attn = self.softmax(attn)
else:
attn = self.softmax(attn)
attn = self.attn_drop(attn)
# x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
x = self.proj(x)
x = self.proj_drop(x)
return x
class SwinTransformerBlock(nn.Layer):
""" Swin Transformer Block.
Args:
dim (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (int): Window size.
shift_size (int): Shift size for SW-MSA.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float, optional): Stochastic depth rate. Default: 0.0
act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self,
dim,
num_heads,
window_size=7,
shift_size=0,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
act_layer=nn.GELU,
norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.num_heads = num_heads
self.window_size = window_size
self.shift_size = shift_size
self.mlp_ratio = mlp_ratio
assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
self.norm1 = norm_layer(dim)
self.attn = WindowAttention(
dim,
window_size=to_2tuple(self.window_size),
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
self.H = None
self.W = None
def forward(self, x, mask_matrix):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
mask_matrix: Attention mask for cyclic shift.
"""
B, L, C = x.shape
H, W = self.H, self.W
assert L == H * W, "input feature has wrong size"
shortcut = x
x = self.norm1(x)
x = x.reshape([-1, H, W, C])
# pad feature maps to multiples of window size
pad_l = pad_t = 0
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
data_format='NHWC')
_, Hp, Wp, _ = x.shape
# cyclic shift
if self.shift_size > 0:
shifted_x = paddle.roll(
x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
attn_mask = mask_matrix
else:
shifted_x = x
attn_mask = None
# partition windows
x_windows = window_partition(
shifted_x, self.window_size) # nW*B, window_size, window_size, C
x_windows = x_windows.reshape(
[x_windows.shape[0], self.window_size * self.window_size,
C]) # nW*B, window_size*window_size, C
# W-MSA/SW-MSA
attn_windows = self.attn(
x_windows, mask=attn_mask) # nW*B, window_size*window_size, C
# merge windows
attn_windows = attn_windows.reshape(
[x_windows.shape[0], self.window_size, self.window_size, C])
shifted_x = window_reverse(attn_windows, self.window_size, Hp,
Wp) # B H' W' C
# reverse cyclic shift
if self.shift_size > 0:
x = paddle.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
axis=(1, 2))
else:
x = shifted_x
if pad_r > 0 or pad_b > 0:
x = x[:, :H, :W, :]
x = x.reshape([-1, H * W, C])
# FFN
x = shortcut + self.drop_path(x)
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class PatchMerging(nn.Layer):
r""" Patch Merging Layer.
Args:
dim (int): Number of input channels.
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
"""
def __init__(self, dim, norm_layer=nn.LayerNorm):
super().__init__()
self.dim = dim
self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
self.norm = norm_layer(4 * dim)
def forward(self, x, H, W):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
B, L, C = x.shape
assert L == H * W, "input feature has wrong size"
x = x.reshape([-1, H, W, C])
# padding
pad_input = (H % 2 == 1) or (W % 2 == 1)
if pad_input:
# paddle F.pad default data_format is 'NCHW'
x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
H += H % 2
W += W % 2
x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C
x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C
x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C
x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C
x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C
x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C
x = self.norm(x)
x = self.reduction(x)
return x
class BasicLayer(nn.Layer):
""" A basic Swin Transformer layer for one stage.
Args:
dim (int): Number of input channels.
depth (int): Number of blocks.
num_heads (int): Number of attention heads.
window_size (int): Local window size.
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
drop (float, optional): Dropout rate. Default: 0.0
attn_drop (float, optional): Attention dropout rate. Default: 0.0
drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
"""
def __init__(self,
dim,
depth,
num_heads,
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
norm_layer=nn.LayerNorm,
downsample=None):
super().__init__()
self.window_size = window_size
self.shift_size = window_size // 2
self.depth = depth
# build blocks
self.blocks = nn.LayerList([
SwinTransformerBlock(
dim=dim,
num_heads=num_heads,
window_size=window_size,
shift_size=0 if (i % 2 == 0) else window_size // 2,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop,
attn_drop=attn_drop,
drop_path=drop_path[i]
if isinstance(drop_path, np.ndarray) else drop_path,
norm_layer=norm_layer) for i in range(depth)
])
# patch merging layer
if downsample is not None:
self.downsample = downsample(dim=dim, norm_layer=norm_layer)
else:
self.downsample = None
def forward(self, x, H, W):
""" Forward function.
Args:
x: Input feature, tensor size (B, H*W, C).
H, W: Spatial resolution of the input feature.
"""
# calculate attention mask for SW-MSA
Hp = int(np.ceil(H / self.window_size)) * self.window_size
Wp = int(np.ceil(W / self.window_size)) * self.window_size
img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1
h_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size, -self.shift_size),
slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
mask_windows = window_partition(
img_mask, self.window_size) # nW, window_size, window_size, 1
mask_windows = mask_windows.reshape(
[-1, self.window_size * self.window_size])
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
huns = -100.0 * paddle.ones_like(attn_mask)
attn_mask = huns * (attn_mask != 0).astype("float32")
for blk in self.blocks:
blk.H, blk.W = H, W
x = blk(x, attn_mask)
if self.downsample is not None:
x_down = self.downsample(x, H, W)
Wh, Ww = (H + 1) // 2, (W + 1) // 2
return x, H, W, x_down, Wh, Ww
else:
return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
Args:
patch_size (int): Patch token size. Default: 4.
in_chans (int): Number of input image channels. Default: 3.
embed_dim (int): Number of linear projection output channels. Default: 96.
norm_layer (nn.Layer, optional): Normalization layer. Default: None
"""
def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
super().__init__()
patch_size = to_2tuple(patch_size)
self.patch_size = patch_size
self.in_chans = in_chans
self.embed_dim = embed_dim
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
if norm_layer is not None:
self.norm = norm_layer(embed_dim)
else:
self.norm = None
def forward(self, x):
# TODO # export dynamic shape
B, C, H, W = x.shape
# assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1])
if W % self.patch_size[1] != 0:
x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0])
if H % self.patch_size[0] != 0:
x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]])
x = self.proj(x)
if self.norm is not None:
_, _, Wh, Ww = x.shape
x = x.flatten(2).transpose([0, 2, 1])
x = self.norm(x)
x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])
return x
@register
@serializable
class SwinTransformer(nn.Layer):
""" Swin Transformer backbone
Args:
arch (str): Architecture of FocalNet
pretrain_img_size (int | tuple(int)): Input image size. Default 224
patch_size (int | tuple(int)): Patch size. Default: 4
in_chans (int): Number of input image channels. Default: 3
embed_dim (int): Patch embedding dimension. Default: 96
depths (tuple(int)): Depth of each Swin Transformer layer.
num_heads (tuple(int)): Number of attention heads in different layers.
window_size (int): Window size. Default: 7
mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
drop_rate (float): Dropout rate. Default: 0
attn_drop_rate (float): Attention dropout rate. Default: 0
drop_path_rate (float): Stochastic depth rate. Default: 0.1
norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
patch_norm (bool): If True, add normalization after patch embedding. Default: True
"""
def __init__(self,
arch='swin_T_224',
pretrain_img_size=224,
patch_size=4,
in_chans=3,
embed_dim=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4.,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
norm_layer=nn.LayerNorm,
ape=False,
patch_norm=True,
out_indices=(0, 1, 2, 3),
frozen_stages=-1,
pretrained=None):
super(SwinTransformer, self).__init__()
assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)
pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
embed_dim = MODEL_cfg[arch]['embed_dim']
depths = MODEL_cfg[arch]['depths']
num_heads = MODEL_cfg[arch]['num_heads']
window_size = MODEL_cfg[arch]['window_size']
if pretrained is None:
pretrained = MODEL_cfg[arch]['pretrained']
self.num_layers = len(depths)
self.ape = ape
self.patch_norm = patch_norm
self.out_indices = out_indices
self.frozen_stages = frozen_stages
# split image into non-overlapping patches
self.patch_embed = PatchEmbed(
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim,
norm_layer=norm_layer if self.patch_norm else None)
# absolute position embedding
if self.ape:
pretrain_img_size = to_2tuple(pretrain_img_size)
patch_size = to_2tuple(patch_size)
patches_resolution = [
pretrain_img_size[0] // patch_size[0],
pretrain_img_size[1] // patch_size[1]
]
self.absolute_pos_embed = add_parameter(
self,
paddle.zeros((1, embed_dim, patches_resolution[0],
patches_resolution[1])))
trunc_normal_(self.absolute_pos_embed)
self.pos_drop = nn.Dropout(p=drop_rate)
# stochastic depth
dpr = np.linspace(0, drop_path_rate,
sum(depths)) # stochastic depth decay rule
# build layers
self.layers = nn.LayerList()
for i_layer in range(self.num_layers):
layer = BasicLayer(
dim=int(embed_dim * 2**i_layer),
depth=depths[i_layer],
num_heads=num_heads[i_layer],
window_size=window_size,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
norm_layer=norm_layer,
downsample=PatchMerging
if (i_layer < self.num_layers - 1) else None)
self.layers.append(layer)
num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
self.num_features = num_features
# add a norm layer for each output
for i_layer in out_indices:
layer = norm_layer(num_features[i_layer])
layer_name = f'norm{i_layer}'
self.add_sublayer(layer_name, layer)
self.apply(self._init_weights)
self._freeze_stages()
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
self.set_state_dict(paddle.load(path))
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.stop_gradient = True
if self.frozen_stages >= 1 and self.ape:
self.absolute_pos_embed.stop_gradient = True
if self.frozen_stages >= 2:
self.pos_drop.eval()
for i in range(0, self.frozen_stages - 1):
m = self.layers[i]
m.eval()
for param in m.parameters():
param.stop_gradient = True
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight)
if isinstance(m, nn.Linear) and m.bias is not None:
zeros_(m.bias)
elif isinstance(m, nn.LayerNorm):
zeros_(m.bias)
ones_(m.weight)
def forward(self, x):
"""Forward function."""
x = self.patch_embed(x['image'])
B, _, Wh, Ww = x.shape
if self.ape:
# interpolate the position embedding to the corresponding size
absolute_pos_embed = F.interpolate(
self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
else:
x = x.flatten(2).transpose([0, 2, 1])
x = self.pos_drop(x)
outs = []
for i in range(self.num_layers):
layer = self.layers[i]
x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
x_out = norm_layer(x_out)
out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
(0, 3, 1, 2))
outs.append(out)
return outs
@property
def out_shape(self):
out_strides = [4, 8, 16, 32]
return [
ShapeSpec(
channels=self.num_features[i], stride=out_strides[i])
for i in self.out_indices
]

View File

@@ -0,0 +1,381 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import ReLU, Swish, GELU
import math
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['TransEncoder']
class BertEmbeddings(nn.Layer):
def __init__(self, word_size, position_embeddings_size, word_type_size,
hidden_size, dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(
word_size, hidden_size, padding_idx=0)
self.position_embeddings = nn.Embedding(position_embeddings_size,
hidden_size)
self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(dropout_prob)
def forward(self, x, token_type_ids=None, position_ids=None):
seq_len = paddle.shape(x)[1]
if position_ids is None:
position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x)
if token_type_ids is None:
token_type_ids = paddle.zeros(paddle.shape(x))
word_embs = self.word_embeddings(x)
position_embs = self.position_embeddings(position_ids)
token_type_embs = self.token_type_embeddings(token_type_ids)
embs_cmb = word_embs + position_embs + token_type_embs
embs_out = self.layernorm(embs_cmb)
embs_out = self.dropout(embs_out)
return embs_out
class BertSelfAttention(nn.Layer):
def __init__(self,
hidden_size,
num_attention_heads,
attention_probs_dropout_prob,
output_attentions=False):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden_size must be a multiple of the number of attention "
"heads, but got {} % {} != 0" %
(hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)
self.dropout = nn.Dropout(attention_probs_dropout_prob)
self.output_attentions = output_attentions
def forward(self, x, attention_mask, head_mask=None):
query = self.query(x)
key = self.key(x)
value = self.value(x)
query_dim1, query_dim2 = paddle.shape(query)[:-1]
new_shape = [
query_dim1, query_dim2, self.num_attention_heads,
self.attention_head_size
]
query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
attention = paddle.matmul(query,
key) / math.sqrt(self.attention_head_size)
attention = attention + attention_mask
attention_value = F.softmax(attention, axis=-1)
attention_value = self.dropout(attention_value)
if head_mask is not None:
attention_value = attention_value * head_mask
context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
3))
ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
new_context_shape = [
ctx_dim1,
ctx_dim2,
self.all_head_size,
]
context = context.reshape(new_context_shape)
if self.output_attentions:
return (context, attention_value)
else:
return (context, )
class BertAttention(nn.Layer):
def __init__(self,
hidden_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
output_attentions=False):
super(BertAttention, self).__init__()
self.bert_selfattention = BertSelfAttention(
hidden_size, num_attention_heads, attention_probs_dropout_prob,
output_attentions)
self.fc = nn.Linear(hidden_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(fc_dropout_prob)
def forward(self, x, attention_mask, head_mask=None):
attention_feats = self.bert_selfattention(x, attention_mask, head_mask)
features = self.fc(attention_feats[0])
features = self.dropout(features)
features = self.layernorm(features + x)
if len(attention_feats) == 2:
return (features, attention_feats[1])
else:
return (features, )
class BertFeedForward(nn.Layer):
def __init__(self,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False):
super(BertFeedForward, self).__init__()
self.fc1 = nn.Linear(hidden_size, intermediate_size)
self.act_fn = eval(act_fn)
self.fc2 = nn.Linear(intermediate_size, hidden_size)
self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
self.dropout = nn.Dropout(fc_dropout_prob)
def forward(self, x):
features = self.fc1(x)
features = self.act_fn(features)
features = self.fc2(features)
features = self.dropout(features)
features = self.layernorm(features + x)
return features
class BertLayer(nn.Layer):
def __init__(self,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads,
attention_probs_dropout_prob,
output_attentions)
self.feed_forward = BertFeedForward(
hidden_size, intermediate_size, num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob, act_fn,
output_attentions)
def forward(self, x, attention_mask, head_mask=None):
attention_feats = self.attention(x, attention_mask, head_mask)
features = self.feed_forward(attention_feats[0])
if len(attention_feats) == 2:
return (features, attention_feats[1])
else:
return (features, )
class BertEncoder(nn.Layer):
def __init__(self,
num_hidden_layers,
hidden_size,
intermediate_size,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False,
output_hidden_feats=False):
super(BertEncoder, self).__init__()
self.output_attentions = output_attentions
self.output_hidden_feats = output_hidden_feats
self.layers = nn.LayerList([
BertLayer(hidden_size, intermediate_size, num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob, act_fn,
output_attentions) for _ in range(num_hidden_layers)
])
def forward(self, x, attention_mask, head_mask=None):
all_features = (x, )
all_attentions = ()
for i, layer in enumerate(self.layers):
mask = head_mask[i] if head_mask is not None else None
layer_out = layer(x, attention_mask, mask)
if self.output_hidden_feats:
all_features = all_features + (x, )
x = layer_out[0]
if self.output_attentions:
all_attentions = all_attentions + (layer_out[1], )
outputs = (x, )
if self.output_hidden_feats:
outputs += (all_features, )
if self.output_attentions:
outputs += (all_attentions, )
return outputs
class BertPooler(nn.Layer):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.fc = nn.Linear(hidden_size, hidden_size)
self.act = nn.Tanh()
def forward(self, x):
first_token = x[:, 0]
pooled_output = self.fc(first_token)
pooled_output = self.act(pooled_output)
return pooled_output
class METROEncoder(nn.Layer):
def __init__(self,
vocab_size,
num_hidden_layers,
features_dims,
position_embeddings_size,
hidden_size,
intermediate_size,
output_feature_dim,
num_attention_heads,
attention_probs_dropout_prob,
fc_dropout_prob,
act_fn='ReLU',
output_attentions=False,
output_hidden_feats=False,
use_img_layernorm=False):
super(METROEncoder, self).__init__()
self.img_dims = features_dims
self.num_hidden_layers = num_hidden_layers
self.use_img_layernorm = use_img_layernorm
self.output_attentions = output_attentions
self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
hidden_size, fc_dropout_prob)
self.encoder = BertEncoder(
num_hidden_layers, hidden_size, intermediate_size,
num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
act_fn, output_attentions, output_hidden_feats)
self.pooler = BertPooler(hidden_size)
self.position_embeddings = nn.Embedding(position_embeddings_size,
hidden_size)
self.img_embedding = nn.Linear(
features_dims, hidden_size, bias_attr=True)
self.dropout = nn.Dropout(fc_dropout_prob)
self.cls_head = nn.Linear(hidden_size, output_feature_dim)
self.residual = nn.Linear(features_dims, output_feature_dim)
self.apply(self.init_weights)
def init_weights(self, module):
""" Initialize the weights.
"""
if isinstance(module, (nn.Linear, nn.Embedding)):
module.weight.set_value(
paddle.normal(
mean=0.0, std=0.02, shape=module.weight.shape))
elif isinstance(module, nn.LayerNorm):
module.bias.set_value(paddle.zeros(shape=module.bias.shape))
module.weight.set_value(
paddle.full(
shape=module.weight.shape, fill_value=1.0))
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.set_value(paddle.zeros(shape=module.bias.shape))
def forward(self, x):
batchsize, seq_len = paddle.shape(x)[:2]
input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
position_ids = paddle.arange(
seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)
attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
head_mask = [None] * self.num_hidden_layers
position_embs = self.position_embeddings(position_ids)
attention_mask = (1.0 - attention_mask) * -10000.0
img_features = self.img_embedding(x)
# We empirically observe that adding an additional learnable position embedding leads to more stable training
embeddings = position_embs + img_features
if self.use_img_layernorm:
embeddings = self.layernorm(embeddings)
embeddings = self.dropout(embeddings)
encoder_outputs = self.encoder(
embeddings, attention_mask, head_mask=head_mask)
pred_score = self.cls_head(encoder_outputs[0])
res_img_feats = self.residual(x)
pred_score = pred_score + res_img_feats
if self.output_attentions and self.output_hidden_feats:
return pred_score, encoder_outputs[1], encoder_outputs[-1]
else:
return pred_score
def gelu(x):
"""Implementation of the gelu activation function.
https://arxiv.org/abs/1606.08415
"""
return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
@register
class TransEncoder(nn.Layer):
def __init__(self,
vocab_size=30522,
num_hidden_layers=4,
num_attention_heads=4,
position_embeddings_size=512,
intermediate_size=3072,
input_feat_dim=[2048, 512, 128],
hidden_feat_dim=[1024, 256, 128],
attention_probs_dropout_prob=0.1,
fc_dropout_prob=0.1,
act_fn='gelu',
output_attentions=False,
output_hidden_feats=False):
super(TransEncoder, self).__init__()
output_feat_dim = input_feat_dim[1:] + [3]
trans_encoder = []
for i in range(len(output_feat_dim)):
features_dims = input_feat_dim[i]
output_feature_dim = output_feat_dim[i]
hidden_size = hidden_feat_dim[i]
# init a transformer encoder and append it to a list
assert hidden_size % num_attention_heads == 0
model = METROEncoder(vocab_size, num_hidden_layers, features_dims,
position_embeddings_size, hidden_size,
intermediate_size, output_feature_dim,
num_attention_heads,
attention_probs_dropout_prob, fc_dropout_prob,
act_fn, output_attentions, output_hidden_feats)
trans_encoder.append(model)
self.trans_encoder = paddle.nn.Sequential(*trans_encoder)
def forward(self, x):
out = self.trans_encoder(x)
return out

View File

@@ -0,0 +1,124 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
# Common initializations
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
# Common Layers
def drop_path(x, drop_prob=0., training=False):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
"""
if drop_prob == 0. or not training:
return x
keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype)
random_tensor = paddle.floor(random_tensor) # binarize
output = x.divide(keep_prob) * random_tensor
return output
class DropPath(nn.Layer):
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
class Identity(nn.Layer):
def __init__(self):
super(Identity, self).__init__()
def forward(self, input):
return input
# common funcs
def to_2tuple(x):
if isinstance(x, (list, tuple)):
return x
return tuple([x] * 2)
def add_parameter(layer, datas, name=None):
parameter = layer.create_parameter(
shape=(datas.shape), default_initializer=Assign(datas))
if name:
layer.add_parameter(name, parameter)
return parameter
def window_partition(x, window_size):
"""
Partition into non-overlapping windows with padding if needed.
Args:
x (tensor): input tokens with [B, H, W, C].
window_size (int): window size.
Returns:
windows: windows after partition with [B * num_windows, window_size, window_size, C].
(Hp, Wp): padded height and width before partition
"""
B, H, W, C = paddle.shape(x)
pad_h = (window_size - H % window_size) % window_size
pad_w = (window_size - W % window_size) % window_size
x = F.pad(x.transpose([0, 3, 1, 2]),
paddle.to_tensor(
[0, int(pad_w), 0, int(pad_h)],
dtype='int32')).transpose([0, 2, 3, 1])
Hp, Wp = H + pad_h, W + pad_w
num_h, num_w = Hp // window_size, Wp // window_size
x = x.reshape([B, num_h, window_size, num_w, window_size, C])
windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape(
[-1, window_size, window_size, C])
return windows, (Hp, Wp), (num_h, num_w)
def window_unpartition(x, pad_hw, num_hw, hw):
"""
Window unpartition into original sequences and removing padding.
Args:
x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
pad_hw (Tuple): padded height and width (Hp, Wp).
hw (Tuple): original height and width (H, W) before padding.
Returns:
x: unpartitioned sequences with [B, H, W, C].
"""
Hp, Wp = pad_hw
num_h, num_w = num_hw
H, W = hw
B, window_size, _, C = paddle.shape(x)
B = B // (num_h * num_w)
x = x.reshape([B, num_h, num_w, window_size, window_size, C])
x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])
return x[:, :H, :W, :]

View File

@@ -0,0 +1,652 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddle.nn.initializer import Constant
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import zeros_, DropPath, Identity
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer=nn.GELU,
drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
qk_scale=None,
attn_drop=0.,
proj_drop=0.,
window_size=None):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
self.scale = qk_scale or head_dim**-0.5
self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)
if qkv_bias:
self.q_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
self.v_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
else:
self.q_bias = None
self.v_bias = None
if window_size:
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
self.relative_position_bias_table = self.create_parameter(
shape=(self.num_relative_distance, num_heads),
default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww
coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
)
#relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh
relative_coords = relative_coords.transpose(
(1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[
0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index",
relative_position_index)
# trunc_normal_(self.relative_position_bias_table, std=.0)
else:
self.window_size = None
self.relative_position_bias_table = None
self.relative_position_index = None
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x, rel_pos_bias=None):
x_shape = paddle.shape(x)
N, C = x_shape[1], x_shape[2]
qkv_bias = None
if self.q_bias is not None:
qkv_bias = paddle.concat(
(self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
qkv = qkv.reshape((-1, N, 3, self.num_heads,
C // self.num_heads)).transpose((2, 0, 3, 1, 4))
q, k, v = qkv[0], qkv[1], qkv[2]
attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale
if self.relative_position_bias_table is not None:
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1
]) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.transpose(
(2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if rel_pos_bias is not None:
attn = attn + rel_pos_bias
attn = nn.functional.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
x = self.proj(x)
x = self.proj_drop(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
qk_scale=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
window_size=None,
init_values=None,
act_layer=nn.GELU,
norm_layer='nn.LayerNorm',
epsilon=1e-5):
super().__init__()
self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
window_size=window_size)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim, epsilon=epsilon)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim,
hidden_features=mlp_hidden_dim,
act_layer=act_layer,
drop=drop)
if init_values is not None:
self.gamma_1 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
self.gamma_2 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x, rel_pos_bias=None):
if self.gamma_1 is None:
x = x + self.drop_path(
self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_1 * self.attn(
self.norm1(x), rel_pos_bias=rel_pos_bias))
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=[224, 224],
patch_size=16,
in_chans=3,
embed_dim=768):
super().__init__()
self.num_patches_w = img_size[0] // patch_size
self.num_patches_h = img_size[1] // patch_size
num_patches = self.num_patches_w * self.num_patches_h
self.patch_shape = (img_size[0] // patch_size,
img_size[1] // patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.num_patches = num_patches
self.proj = nn.Conv2D(
in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size
def forward(self, x, mask=None):
B, C, H, W = x.shape
return self.proj(x)
class RelativePositionBias(nn.Layer):
def __init__(self, window_size, num_heads):
super().__init__()
self.window_size = window_size
self.num_relative_distance = (2 * window_size[0] - 1) * (
2 * window_size[1] - 1) + 3
self.relative_position_bias_table = self.create_parameter(
shape=(self.num_relative_distance, num_heads),
default_initialize=zeros_)
# cls to token & token 2 cls & cls to cls
# get pair-wise relative position index for each token inside the window
coords_h = paddle.arange(window_size[0])
coords_w = paddle.arange(window_size[1])
coords = paddle.stack(paddle.meshgrid(
[coords_h, coords_w])) # 2, Wh, Ww
coords_flatten = coords.flatten(1) # 2, Wh*Ww
relative_coords = coords_flatten[:, :,
None] - coords_flatten[:,
None, :] # 2, Wh*Ww, Wh*Ww
relative_coords = relative_coords.transpos(
(1, 2, 0)) # Wh*Ww, Wh*Ww, 2
relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0
relative_coords[:, :, 1] += window_size[1] - 1
relative_coords[:, :, 0] *= 2 * window_size[1] - 1
relative_position_index = \
paddle.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
relative_position_index[1:, 1:] = relative_coords.sum(
-1) # Wh*Ww, Wh*Ww
relative_position_index[0, 0:] = self.num_relative_distance - 3
relative_position_index[0:, 0] = self.num_relative_distance - 2
relative_position_index[0, 0] = self.num_relative_distance - 1
self.register_buffer("relative_position_index", relative_position_index)
def forward(self):
relative_position_bias = \
self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([
self.window_size[0] * self.window_size[1] + 1,
self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH
return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
''' Sinusoid position encoding table '''
def get_position_angle_vec(position):
return [
position / np.power(10000, 2 * (hid_j // 2) / d_hid)
for hid_j in range(d_hid)
]
sinusoid_table = np.array(
[get_position_angle_vec(pos_i) for pos_i in range(n_position)])
sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
if token:
sinusoid_table = np.concatenate(
[sinusoid_table, np.zeros([1, d_hid])], dim=0)
return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
@register
@serializable
class VisionTransformer(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=[672, 1092],
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
norm_layer='nn.LayerNorm',
init_values=None,
use_rel_pos_bias=False,
use_shared_rel_pos_bias=False,
epsilon=1e-5,
final_norm=False,
pretrained=None,
out_indices=[3, 5, 7, 11],
use_abs_pos_emb=False,
use_sincos_pos_emb=True,
with_fpn=True,
num_fpn_levels=4,
use_checkpoint=False,
**args):
super().__init__()
self.img_size = img_size
self.embed_dim = embed_dim
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.use_sincos_pos_emb = use_sincos_pos_emb
self.use_rel_pos_bias = use_rel_pos_bias
self.final_norm = final_norm
self.out_indices = out_indices
self.num_fpn_levels = num_fpn_levels
if use_checkpoint:
paddle.seed(0)
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.cls_token = self.create_parameter(
shape=(1, 1, embed_dim),
default_initializer=paddle.nn.initializer.Constant(value=0.))
if use_abs_pos_emb:
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.build_2d_sincos_position_embedding(embed_dim)
self.pos_embed = pos_embed
self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.pos_drop = nn.Dropout(p=drop_rate)
if use_shared_rel_pos_bias:
self.rel_pos_bias = RelativePositionBias(
window_size=self.patch_embed.patch_shape, num_heads=num_heads)
else:
self.rel_pos_bias = None
dpr = np.linspace(0, drop_path_rate, depth)
self.blocks = nn.LayerList([
Block(
dim=embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
norm_layer=norm_layer,
init_values=init_values,
window_size=self.patch_embed.patch_shape
if use_rel_pos_bias else None,
epsilon=epsilon) for i in range(depth)
])
self.pretrained = pretrained
self.init_weight()
assert len(out_indices) <= 4, ''
self.out_indices = out_indices
self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
patch_size for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
assert num_fpn_levels <= 4, ''
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size, )
def init_weight(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained: #URL
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else: #model in local path
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys():
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.BatchNorm2D(embed_dim),
nn.GELU(),
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn3 = Identity()
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = Identity()
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
if not out_with_norm:
self.norm = Identity()
else:
self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
def interpolate_pos_encoding(self, x, w, h):
npatch = x.shape[1] - 1
N = self.pos_embed.shape[1] - 1
w0 = w // self.patch_embed.patch_size
h0 = h // self.patch_embed.patch_size
if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
return self.pos_embed
class_pos_embed = self.pos_embed[:, 0]
patch_pos_embed = self.pos_embed[:, 1:]
dim = x.shape[-1]
# we add a small number to avoid floating point error in the interpolation
# see discussion at https://github.com/facebookresearch/dino/issues/8
# w0, h0 = w0 + 0.1, h0 + 0.1
# patch_pos_embed = nn.functional.interpolate(
# patch_pos_embed.reshape([
# 1, self.patch_embed.num_patches_w,
# self.patch_embed.num_patches_h, dim
# ]).transpose((0, 3, 1, 2)),
# scale_factor=(w0 / self.patch_embed.num_patches_w,
# h0 / self.patch_embed.num_patches_h),
# mode='bicubic', )
patch_pos_embed = nn.functional.interpolate(
patch_pos_embed.reshape([
1, self.patch_embed.num_patches_w,
self.patch_embed.num_patches_h, dim
]).transpose((0, 3, 1, 2)),
(w0, h0),
mode='bicubic', )
assert int(w0) == patch_pos_embed.shape[-2] and int(
h0) == patch_pos_embed.shape[-1]
patch_pos_embed = patch_pos_embed.transpose(
(0, 2, 3, 1)).reshape([1, -1, dim])
return paddle.concat(
(class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def build_2d_sincos_position_embedding(
self,
embed_dim=768,
temperature=10000., ):
h, w = self.patch_embed.patch_shape
grid_w = paddle.arange(w, dtype=paddle.float32)
grid_h = paddle.arange(h, dtype=paddle.float32)
grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = 1. / (temperature**omega)
out_w = grid_w.flatten()[..., None] @omega[None]
out_h = grid_h.flatten()[..., None] @omega[None]
pos_emb = paddle.concat(
[
paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
paddle.cos(out_h)
],
axis=1)[None, :, :]
pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
# pos_embed.stop_gradient = True
return pos_embed
def forward(self, x):
x = x['image'] if isinstance(x, dict) else x
_, _, h, w = x.shape
x = self.patch_embed(x)
B, D, Hp, Wp = x.shape # b * c * h * w
cls_tokens = self.cls_token.expand(
(B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c
x = paddle.concat([cls_tokens, x], axis=1)
if self.pos_embed is not None:
# x = x + self.interpolate_pos_encoding(x, w, h)
x = x + self.interpolate_pos_encoding(x, h, w)
x = self.pos_drop(x)
rel_pos_bias = self.rel_pos_bias(
) if self.rel_pos_bias is not None else None
feats = []
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
blk, x, rel_pos_bias, **{"preserve_rng_state": True})
else:
x = blk(x, rel_pos_bias)
if idx in self.out_indices:
xp = paddle.reshape(
paddle.transpose(
self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
shape=[B, D, Hp, Wp])
feats.append(xp)
if self.with_fpn:
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
-self.num_fpn_levels:]
assert len(fpns) == len(feats) or len(feats) == 1, ''
outputs = []
for i, m in enumerate(fpns):
outputs.append(
m(feats[i] if len(feats) == len(fpns) else feats[-1]))
return outputs
return feats
@property
def num_layers(self):
return len(self.blocks)
@property
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self.out_channels, self.out_strides)
]

View File

@@ -0,0 +1,749 @@
# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
import math
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant, TruncatedNormal
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
window_unpartition)
from ..initializer import linear_init_
__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
class Mlp(nn.Layer):
def __init__(self,
in_features,
hidden_features=None,
out_features=None,
act_layer='nn.GELU',
drop=0.,
lr_factor=1.0):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(
in_features,
hidden_features,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.act = eval(act_layer)()
self.fc2 = nn.Linear(
hidden_features,
out_features,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.drop = nn.Dropout(drop)
self._init_weights()
def _init_weights(self):
linear_init_(self.fc1)
linear_init_(self.fc2)
def forward(self, x):
x = self.drop(self.act(self.fc1(x)))
x = self.drop(self.fc2(x))
return x
class Attention(nn.Layer):
def __init__(self,
dim,
num_heads=8,
qkv_bias=False,
attn_bias=False,
attn_drop=0.,
proj_drop=0.,
use_rel_pos=False,
rel_pos_zero_init=True,
window_size=None,
input_size=None,
qk_scale=None,
lr_factor=1.0):
super().__init__()
self.num_heads = num_heads
self.head_dim = dim // num_heads
self.scale = qk_scale or self.head_dim**-0.5
self.use_rel_pos = use_rel_pos
self.input_size = input_size
self.rel_pos_zero_init = rel_pos_zero_init
self.window_size = window_size
self.lr_factor = lr_factor
self.qkv = nn.Linear(
dim,
dim * 3,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor)
if attn_bias else False)
if qkv_bias:
self.q_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
self.v_bias = self.create_parameter(
shape=([dim]), default_initializer=zeros_)
else:
self.q_bias = None
self.v_bias = None
self.proj = nn.Linear(
dim,
dim,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
self.attn_drop = nn.Dropout(attn_drop)
if window_size is None:
self.window_size = self.input_size[0]
self._init_weights()
def _init_weights(self):
linear_init_(self.qkv)
linear_init_(self.proj)
if self.use_rel_pos:
self.rel_pos_h = self.create_parameter(
[2 * self.window_size - 1, self.head_dim],
attr=ParamAttr(learning_rate=self.lr_factor),
default_initializer=Constant(value=0.))
self.rel_pos_w = self.create_parameter(
[2 * self.window_size - 1, self.head_dim],
attr=ParamAttr(learning_rate=self.lr_factor),
default_initializer=Constant(value=0.))
if not self.rel_pos_zero_init:
TruncatedNormal(self.rel_pos_h, std=0.02)
TruncatedNormal(self.rel_pos_w, std=0.02)
def get_rel_pos(self, seq_size, rel_pos):
max_rel_dist = int(2 * seq_size - 1)
# Interpolate rel pos if needed.
if rel_pos.shape[0] != max_rel_dist:
# Interpolate rel pos.
rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
rel_pos = rel_pos.transpose([0, 2, 1])
rel_pos_resized = F.interpolate(
rel_pos,
size=(max_rel_dist, ),
mode="linear",
data_format='NCW')
rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
rel_pos_resized = rel_pos_resized.transpose([1, 0])
else:
rel_pos_resized = rel_pos
coords = paddle.arange(seq_size, dtype='float32')
relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
relative_coords += (seq_size - 1)
relative_coords = relative_coords.astype('int64').flatten()
return paddle.index_select(rel_pos_resized, relative_coords).reshape(
[seq_size, seq_size, self.head_dim])
def add_decomposed_rel_pos(self, attn, q, h, w):
"""
Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
Args:
attn (Tensor): attention map.
q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
Returns:
attn (Tensor): attention map with added relative positional embeddings.
"""
Rh = self.get_rel_pos(h, self.rel_pos_h)
Rw = self.get_rel_pos(w, self.rel_pos_w)
B, _, dim = q.shape
r_q = q.reshape([B, h, w, dim])
# bhwc, hch->bhwh1
# bwhc, wcw->bhw1w
rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)
attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
return attn.reshape([B, h * w, h * w])
def forward(self, x):
B, H, W, C = paddle.shape(x)
if self.q_bias is not None:
qkv_bias = paddle.concat(
(self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
else:
qkv = self.qkv(x).reshape(
[B, H * W, 3, self.num_heads, self.head_dim]).transpose(
[2, 0, 3, 1, 4]).reshape(
[3, B * self.num_heads, H * W, self.head_dim])
q, k, v = qkv[0], qkv[1], qkv[2]
attn = q.matmul(k.transpose([0, 2, 1])) * self.scale
if self.use_rel_pos:
attn = self.add_decomposed_rel_pos(attn, q, H, W)
attn = F.softmax(attn, axis=-1)
attn = self.attn_drop(attn)
x = attn.matmul(v).reshape(
[B, self.num_heads, H * W, self.head_dim]).transpose(
[0, 2, 1, 3]).reshape([B, H, W, C])
x = self.proj(x)
return x
class Block(nn.Layer):
def __init__(self,
dim,
num_heads,
mlp_ratio=4.,
qkv_bias=False,
attn_bias=False,
qk_scale=None,
init_values=None,
drop=0.,
attn_drop=0.,
drop_path=0.,
use_rel_pos=True,
rel_pos_zero_init=True,
window_size=None,
input_size=None,
act_layer='nn.GELU',
norm_layer='nn.LayerNorm',
lr_factor=1.0,
epsilon=1e-5):
super().__init__()
self.window_size = window_size
self.norm1 = eval(norm_layer)(dim,
weight_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
epsilon=epsilon)
self.attn = Attention(
dim,
num_heads=num_heads,
qkv_bias=qkv_bias,
attn_bias=attn_bias,
qk_scale=qk_scale,
attn_drop=attn_drop,
proj_drop=drop,
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=window_size,
input_size=input_size,
lr_factor=lr_factor)
self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
self.norm2 = eval(norm_layer)(dim,
weight_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
bias_attr=ParamAttr(
learning_rate=lr_factor,
regularizer=L2Decay(0.0)),
epsilon=epsilon)
self.mlp = Mlp(in_features=dim,
hidden_features=int(dim * mlp_ratio),
act_layer=act_layer,
drop=drop,
lr_factor=lr_factor)
if init_values is not None:
self.gamma_1 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
self.gamma_2 = self.create_parameter(
shape=([dim]), default_initializer=Constant(value=init_values))
else:
self.gamma_1, self.gamma_2 = None, None
def forward(self, x):
y = self.norm1(x)
if self.window_size is not None:
y, pad_hw, num_hw = window_partition(y, self.window_size)
y = self.attn(y)
if self.gamma_1 is not None:
y = self.gamma_1 * y
if self.window_size is not None:
y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2]))
x = x + self.drop_path(y)
if self.gamma_2 is None:
x = x + self.drop_path(self.mlp(self.norm2(x)))
else:
x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
return x
class PatchEmbed(nn.Layer):
""" Image to Patch Embedding
"""
def __init__(self,
img_size=(224, 224),
patch_size=16,
in_chans=3,
embed_dim=768,
lr_factor=0.01):
super().__init__()
self.img_size = img_size
self.patch_size = patch_size
self.proj = nn.Conv2D(
in_chans,
embed_dim,
kernel_size=patch_size,
stride=patch_size,
weight_attr=ParamAttr(learning_rate=lr_factor),
bias_attr=ParamAttr(learning_rate=lr_factor))
@property
def num_patches_in_h(self):
return self.img_size[1] // self.patch_size
@property
def num_patches_in_w(self):
return self.img_size[0] // self.patch_size
def forward(self, x):
out = self.proj(x)
return out
@register
@serializable
class VisionTransformer2D(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=(1024, 1024),
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
attn_bias=False,
qk_scale=None,
init_values=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_layer='nn.GELU',
norm_layer='nn.LayerNorm',
lr_decay_rate=1.0,
global_attn_indexes=(2, 5, 8, 11),
use_abs_pos=False,
use_rel_pos=False,
use_abs_pos_emb=False,
use_sincos_pos_emb=False,
rel_pos_zero_init=True,
epsilon=1e-5,
final_norm=False,
pretrained=None,
window_size=None,
out_indices=(11, ),
with_fpn=False,
use_checkpoint=False,
*args,
**kwargs):
super().__init__()
self.img_size = img_size
self.patch_size = patch_size
self.embed_dim = embed_dim
self.num_heads = num_heads
self.depth = depth
self.global_attn_indexes = global_attn_indexes
self.epsilon = epsilon
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.patch_h = img_size[0] // patch_size
self.patch_w = img_size[1] // patch_size
self.num_patches = self.patch_h * self.patch_w
self.use_abs_pos = use_abs_pos
self.use_abs_pos_emb = use_abs_pos_emb
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
dpr = np.linspace(0, drop_path_rate, depth)
if use_checkpoint:
paddle.seed(0)
if use_abs_pos_emb:
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
self.patch_w)
self.pos_embed = pos_embed
self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.blocks = nn.LayerList([
Block(
embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
attn_bias=attn_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=None
if i in self.global_attn_indexes else window_size,
input_size=[self.patch_h, self.patch_w],
act_layer=act_layer,
lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
norm_layer=norm_layer,
init_values=init_values,
epsilon=epsilon) for i in range(depth)
])
assert len(out_indices) <= 4, 'out_indices out of bound'
self.out_indices = out_indices
self.pretrained = pretrained
self.init_weight()
self.out_channels = [embed_dim for _ in range(len(out_indices))]
self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
patch_size for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size,
out_with_norm=final_norm)
def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
return lr_decay_rate**(self.depth - layer_id)
def init_weight(self):
pretrained = self.pretrained
if pretrained:
if 'http' in pretrained:
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else:
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
if pos_embed_name in load_state_dict.keys(
) and self.use_abs_pos_emb:
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
if patch_size == 16:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2),
nn.BatchNorm2D(embed_dim),
nn.GELU(),
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn3 = Identity()
self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
elif patch_size == 8:
self.fpn1 = nn.Sequential(
nn.Conv2DTranspose(
embed_dim, embed_dim, kernel_size=2, stride=2), )
self.fpn2 = Identity()
self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )
if not out_with_norm:
self.norm = Identity()
else:
self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
"""
Resize pos_embed weight.
Args:
pos_embed (Tensor): the pos_embed weight
old_hw (list[int]): the height and width of old pos_embed
new_hw (list[int]): the height and width of new pos_embed
Returns:
Tensor: the resized pos_embed weight
"""
cls_pos_embed = pos_embed[:, :1, :]
pos_embed = pos_embed[:, 1:, :]
pos_embed = pos_embed.transpose([0, 2, 1])
pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
pos_embed = F.interpolate(
pos_embed, new_hw, mode='bicubic', align_corners=False)
pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)
return pos_embed
def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
grid_y, grid_x = paddle.meshgrid(
paddle.arange(
h, dtype=paddle.float32),
paddle.arange(
w, dtype=paddle.float32))
assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
pos_dim = self.embed_dim // 4
omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
omega = (1. / (temperature**omega)).unsqueeze(0)
out_x = grid_x.reshape([-1, 1]).matmul(omega)
out_y = grid_y.reshape([-1, 1]).matmul(omega)
pos_emb = paddle.concat(
[
paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
paddle.cos(out_x)
],
axis=1)
return pos_emb.reshape([1, h, w, self.embed_dim])
def forward(self, inputs):
x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
B, Hp, Wp, _ = paddle.shape(x)
if self.use_abs_pos:
x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
if self.use_abs_pos_emb:
x = x + self.resize_pos_embed(self.pos_embed,
(self.pos_h, self.pos_w), (Hp, Wp))
feats = []
for idx, blk in enumerate(self.blocks):
if self.use_checkpoint and self.training:
x = paddle.distributed.fleet.utils.recompute(
blk, x, **{"preserve_rng_state": True})
else:
x = blk(x)
if idx in self.out_indices:
feats.append(self.norm(x.transpose([0, 3, 1, 2])))
if self.with_fpn:
fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
for i in range(len(feats)):
feats[i] = fpns[i](feats[i])
return feats
@property
def num_layers(self):
return len(self.blocks)
@property
def no_weight_decay(self):
return {'pos_embed', 'cls_token'}
@property
def out_shape(self):
return [
ShapeSpec(
channels=c, stride=s)
for c, s in zip(self.out_channels, self.out_strides)
]
class LayerNorm(nn.Layer):
"""
A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
variance normalization over the channel dimension for inputs that have shape
(batch_size, channels, height, width).
Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid.
In ViT, we use the nn.LayerNorm
"""
def __init__(self, normalized_shape, eps=1e-6):
super().__init__()
self.weight = self.create_parameter([normalized_shape])
self.bias = self.create_parameter([normalized_shape])
self.eps = eps
self.normalized_shape = (normalized_shape, )
def forward(self, x):
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / paddle.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
@register
@serializable
class SimpleFeaturePyramid(nn.Layer):
def __init__(self,
in_channels,
out_channels,
spatial_scales,
num_levels=4,
use_bias=False):
"""
Args:
in_channels (list[int]): input channels of each level which can be
derived from the output shape of backbone by from_config
out_channel (int): output channel of each level.
spatial_scales (list[float]): list of scaling factors to upsample or downsample
the input features for creating pyramid features which can be derived from
the output shape of backbone by from_config
num_levels (int): number of levels of output features.
use_bias (bool): whether use bias or not.
"""
super(SimpleFeaturePyramid, self).__init__()
self.in_channels = in_channels[0]
self.out_channels = out_channels
self.num_levels = num_levels
self.stages = []
dim = self.in_channels
if num_levels == 4:
scale_factors = [2.0, 1.0, 0.5]
elif num_levels == 5:
scale_factors = [4.0, 2.0, 1.0, 0.5]
else:
raise NotImplementedError(
f"num_levels={num_levels} is not supported yet.")
dim = in_channels[0]
for idx, scale in enumerate(scale_factors):
out_dim = dim
if scale == 4.0:
layers = [
nn.Conv2DTranspose(
dim, dim // 2, kernel_size=2, stride=2),
nn.LayerNorm(dim // 2),
nn.GELU(),
nn.Conv2DTranspose(
dim // 2, dim // 4, kernel_size=2, stride=2),
]
out_dim = dim // 4
elif scale == 2.0:
layers = [
nn.Conv2DTranspose(
dim, dim // 2, kernel_size=2, stride=2)
]
out_dim = dim // 2
elif scale == 1.0:
layers = []
elif scale == 0.5:
layers = [nn.MaxPool2D(kernel_size=2, stride=2)]
layers.extend([
nn.Conv2D(
out_dim,
out_channels,
kernel_size=1,
bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(
out_channels,
out_channels,
kernel_size=3,
padding=1,
bias_attr=use_bias, ), LayerNorm(out_channels)
])
layers = nn.Sequential(*layers)
stage = -int(math.log2(spatial_scales[0] * scale_factors[idx]))
self.add_sublayer(f"simfp_{stage}", layers)
self.stages.append(layers)
# top block output feature maps.
self.top_block = nn.Sequential(
nn.MaxPool2D(
kernel_size=1, stride=2, padding=0))
@classmethod
def from_config(cls, cfg, input_shape):
return {
'in_channels': [i.channels for i in input_shape],
'spatial_scales': [1.0 / i.stride for i in input_shape],
}
@property
def out_shape(self):
return [
ShapeSpec(channels=self.out_channels)
for _ in range(self.num_levels)
]
def forward(self, feats):
"""
Args:
x: Tensor of shape (N,C,H,W).
"""
features = feats[0]
results = []
for stage in self.stages:
results.append(stage(features))
top_block_in_feature = results[-1]
results.append(self.top_block(top_block_in_feature))
assert self.num_levels == len(results)
return results