first commit

This commit is contained in:
陈赣
2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions

View File

@@ -0,0 +1,27 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
warnings.filterwarnings(
action='ignore', category=DeprecationWarning, module='ops')
from .ops import *
from .backbones import *
from .heads import *
from .losses import *
from .architectures import *
from .post_process import *
from .layers import *
from .transformers import *

View File

@@ -0,0 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .meta_arch import *
from .detr import *

View File

@@ -0,0 +1,116 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
from .meta_arch import BaseArch
from ppdet.core.workspace import register, create
__all__ = ['DETR']
# Deformable DETR, DINO use the same architecture as DETR
@register
class DETR(BaseArch):
    """DETR-style detection architecture.

    Chains a backbone, an optional neck, a transformer and a DETR head.
    Per the note in this file, Deformable DETR and DINO reuse this same
    architecture.

    Args:
        backbone: Module called with the input dict to produce features.
        transformer: Module called as ``transformer(feats, pad_mask, inputs)``.
        detr_head: Head returning a dict of losses in training mode and raw
            predictions otherwise.
        neck: Optional module applied to the backbone features.
        post_process: Callable turning head predictions into
            ``(bbox, bbox_num, mask)``.
        with_mask (bool): When True the prediction dict also carries ``mask``.
        exclude_post_process (bool): When True the head predictions are
            unpacked directly as ``(bbox, bbox_num, mask)`` and
            ``post_process`` is not called.
    """
    __category__ = 'architecture'
    __inject__ = ['post_process']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 with_mask=False,
                 exclude_post_process=False):
        super().__init__()
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build the sub-modules from ``cfg`` and return them as a dict.

        The transformer receives the neck's output shape when a neck is
        configured and the backbone's output shape otherwise; the head always
        receives the backbone's output shape.
        """
        backbone = create(cfg['backbone'])

        # The neck is optional: a falsy ``cfg['neck']`` disables it.
        feat_shape = backbone.out_shape
        if cfg['neck']:
            neck = create(cfg['neck'], input_shape=feat_shape)
        else:
            neck = None
        if neck is not None:
            feat_shape = neck.out_shape

        transformer = create(cfg['transformer'], input_shape=feat_shape)

        detr_head = create(
            cfg['detr_head'],
            hidden_dim=transformer.hidden_dim,
            nhead=transformer.nhead,
            input_shape=backbone.out_shape)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        """Run the full pipeline on ``self.inputs``.

        Returns the loss dict (with an added ``'loss'`` total) in training
        mode, otherwise a dict with ``'bbox'``, ``'bbox_num'`` and, if
        ``with_mask`` is set, ``'mask'``.
        """
        feats = self.backbone(self.inputs)
        if self.neck is not None:
            feats = self.neck(feats)

        pad_mask = self.inputs.get('pad_mask', None)
        out_transformer = self.transformer(feats, pad_mask, self.inputs)

        if self.training:
            losses = self.detr_head(out_transformer, feats, self.inputs)
            # Entries whose key contains 'log' are excluded from the total.
            total = paddle.add_n(
                [v for k, v in losses.items() if 'log' not in k])
            losses.update({'loss': total})
            return losses

        preds = self.detr_head(out_transformer, feats)
        if self.exclude_post_process:
            bbox, bbox_num, mask = preds
        else:
            bbox, bbox_num, mask = self.post_process(
                preds, self.inputs['im_shape'], self.inputs['scale_factor'],
                paddle.shape(self.inputs['image'])[2:])

        output = {'bbox': bbox, 'bbox_num': bbox_num}
        if self.with_mask:
            output['mask'] = mask
        return output

    def get_loss(self):
        """Training entry point; delegates to ``_forward``."""
        return self._forward()

    def get_pred(self):
        """Inference entry point; delegates to ``_forward``."""
        return self._forward()

View File

@@ -0,0 +1,132 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import typing
from ppdet.core.workspace import register
from ppdet.modeling.post_process import nms
__all__ = ['BaseArch']
@register
class BaseArch(nn.Layer):
    """Common base class for detection architectures.

    Subclasses implement ``get_loss`` and ``get_pred``; ``forward`` stores the
    incoming batch on ``self.inputs`` and dispatches to one of them depending
    on ``self.training``.

    Args:
        data_format (str): ``'NCHW'`` or ``'NHWC'``. With ``'NHWC'`` the input
            image is transposed with permutation ``[0, 2, 3, 1]``.
        use_extra_data (bool): Stored on the instance; not used in this class.
    """

    def __init__(self, data_format='NCHW', use_extra_data=False):
        super().__init__()
        self.data_format = data_format
        self.inputs = {}
        self.fuse_norm = False
        self.use_extra_data = use_extra_data

    def load_meanstd(self, cfg_transform):
        """Derive ``self.scale`` / ``self.bias`` from a transform config.

        The first item containing a ``'NormalizeImage'`` key supplies ``mean``
        and ``std``; if none is found the defaults below are used. The
        resulting tensors are shaped to broadcast over the channel axis of
        the configured data format.
        """
        scale = 1.
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)
        for item in cfg_transform:
            if 'NormalizeImage' not in item:
                continue
            norm_cfg = item['NormalizeImage']
            mean = np.array(norm_cfg['mean'], dtype=np.float32)
            std = np.array(norm_cfg['std'], dtype=np.float32)
            if norm_cfg.get('is_scale', True):
                scale = 1. / 255.
            # Only the first NormalizeImage entry is taken into account.
            break

        if self.data_format == 'NHWC':
            bcast_shape = (1, 1, 1, 3)
        else:
            bcast_shape = (1, 3, 1, 1)
        self.scale = paddle.to_tensor(scale / std).reshape(bcast_shape)
        self.bias = paddle.to_tensor(-mean / std).reshape(bcast_shape)

    def forward(self, inputs):
        """Store ``inputs`` and return losses (training) or predictions.

        In evaluation mode ``inputs`` may be a sequence of input dicts
        (multi-scale testing); each is run through ``get_pred`` and, when
        there is more than one, the results are merged.
        """
        if self.data_format == 'NHWC':
            inputs['image'] = paddle.transpose(inputs['image'], [0, 2, 3, 1])

        if self.fuse_norm:
            # Normalisation is applied here; only these three keys are kept.
            self.inputs['image'] = inputs['image'] * self.scale + self.bias
            self.inputs['im_shape'] = inputs['im_shape']
            self.inputs['scale_factor'] = inputs['scale_factor']
        else:
            self.inputs = inputs

        self.model_arch()

        if self.training:
            return self.get_loss()

        # Multi-scale input arrives as a sequence; wrap a single input.
        if isinstance(inputs, typing.Sequence):
            inputs_list = list(inputs)
        else:
            inputs_list = [inputs]

        outs = []
        for inp in inputs_list:
            if self.fuse_norm:
                self.inputs['image'] = inp['image'] * self.scale + self.bias
                self.inputs['im_shape'] = inp['im_shape']
                self.inputs['scale_factor'] = inp['scale_factor']
            else:
                self.inputs = inp
            outs.append(self.get_pred())

        if len(outs) > 1:
            return self.merge_multi_scale_predictions(outs)
        return outs[0]

    def merge_multi_scale_predictions(self, outs):
        """Merge per-scale predictions with class-wise NMS.

        Only supported when the concrete class is named CascadeRCNN,
        FasterRCNN or MaskRCNN; any other class raises ``Exception``.
        Column 0 of each ``'bbox'`` row is used as the class id.
        """
        supported = ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN')
        if self.__class__.__name__ not in supported:
            raise Exception(
                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
            )
        num_classes = self.bbox_head.num_classes
        keep_top_k = self.bbox_post_process.nms.keep_top_k
        nms_threshold = self.bbox_post_process.nms.nms_threshold

        merged = paddle.concat([o['bbox'] for o in outs]).numpy()
        per_class = []
        for cls_id in range(num_classes):
            selected = merged[:, 0] == cls_id
            if np.count_nonzero(selected) == 0:
                continue
            kept = nms(merged[selected, 1:], nms_threshold)
            cls_col = np.full((kept.shape[0], 1), cls_id)
            per_class.append(np.concatenate([cls_col, kept], 1))

        # NOTE(review): np.concatenate raises if no class produced any box.
        boxes = np.concatenate(per_class)
        # Rows are ordered by column 1 (presumably the score - confirm) and
        # the last ``keep_top_k`` rows are retained.
        ranked = sorted(boxes, key=lambda e: e[1])
        boxes = np.concatenate(ranked[-keep_top_k:]).reshape((-1, 6))
        return {
            'bbox': paddle.to_tensor(boxes),
            'bbox_num': paddle.to_tensor(np.array([boxes.shape[0], ]))
        }

    def build_inputs(self, data, input_def):
        """Map positional ``data`` onto the names listed in ``input_def``."""
        return {name: data[pos] for pos, name in enumerate(input_def)}

    def model_arch(self, ):
        """Hook called by ``forward`` before dispatch; no-op by default."""
        pass

    def get_loss(self, ):
        """Return training losses; must be implemented by subclasses."""
        raise NotImplementedError("Should implement get_loss method!")

    def get_pred(self, ):
        """Return predictions; must be implemented by subclasses."""
        raise NotImplementedError("Should implement get_pred method!")

View File

@@ -0,0 +1,30 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .resnet import *
from .darknet import *
from .mobilenet_v1 import *
from .mobilenet_v3 import *
from .shufflenet_v2 import *
from .swin_transformer import *
from .lcnet import *
from .cspresnet import *
from .csp_darknet import *
from .convnext import *
from .vision_transformer import *
from .mobileone import *
from .trans_encoder import *
from .focalnet import *
from .vit_mae import *
from .hgnet_v2 import *

View File

@@ -0,0 +1,245 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Modified from https://github.com/facebookresearch/ConvNeXt
Copyright (c) Meta Platforms, Inc. and affiliates.
All rights reserved.
This source code is licensed under the license found in the
LICENSE file in the root directory of this source tree.
'''
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.nn.initializer import Constant
import numpy as np
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
from .transformer_utils import DropPath, trunc_normal_, zeros_
__all__ = ['ConvNeXt']
class Block(nn.Layer):
    r"""ConvNeXt residual block.

    Two equivalent formulations exist:
    (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv,
        all in (N, C, H, W);
    (2) DwConv -> permute to (N, H, W, C) -> LayerNorm (channels_last)
        -> Linear -> GELU -> Linear -> permute back.
    Formulation (2) is the one implemented here; the original authors report
    it as slightly faster.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale; a value
            that is not greater than 0 disables the scale. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        # 7x7 depthwise convolution (groups == channels).
        self.dwconv = nn.Conv2D(dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # Pointwise (1x1) convolutions expressed as linear layers.
        self.pwconv1 = nn.Linear(dim, 4 * dim)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(4 * dim, dim)

        if layer_scale_init_value > 0:
            scale_attr = ParamAttr(
                initializer=Constant(layer_scale_init_value))
            self.gamma = self.create_parameter(shape=(dim, ), attr=scale_attr)
        else:
            self.gamma = None

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        shortcut = x
        y = self.dwconv(x)
        # Channels-last layout for the norm and linear layers.
        y = y.transpose([0, 2, 3, 1])
        y = self.pwconv2(self.act(self.pwconv1(self.norm(y))))
        if self.gamma is not None:
            y = self.gamma * y
        # Back to channels-first before the residual sum.
        y = y.transpose([0, 3, 1, 2])
        return shortcut + self.drop_path(y)
class LayerNorm(nn.Layer):
    r"""LayerNorm supporting ``channels_last`` (default) and ``channels_first``.

    ``channels_last`` is meant for inputs laid out as
    (batch_size, height, width, channels) and ``channels_first`` for
    (batch_size, channels, height, width). In the latter case the statistics
    are computed over axis 1.

    Args:
        normalized_shape (int): Size of the normalised (channel) axis.
        eps (float): Value added to the variance before the square root.
        data_format (str): ``"channels_last"`` or ``"channels_first"``.

    Raises:
        NotImplementedError: If ``data_format`` is neither of the two.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()
        param_shape = (normalized_shape, )
        self.weight = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
        self.bias = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))
        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.eps)
        elif self.data_format == "channels_first":
            # Manual normalisation over the channel axis (axis 1).
            mean = x.mean(1, keepdim=True)
            var = (x - mean).pow(2).mean(1, keepdim=True)
            normed = (x - mean) / paddle.sqrt(var + self.eps)
            w = self.weight[:, None, None]
            b = self.bias[:, None, None]
            return w * normed + b
@register
@serializable
class ConvNeXt(nn.Layer):
    r"""ConvNeXt backbone.

    A Paddle implementation of `A ConvNet for the 2020s` -
    https://arxiv.org/pdf/2201.03545.pdf

    Args:
        arch (str): Key into ``arch_settings`` selecting per-stage ``depths``
            and ``dims``. Default: 'tiny'
        in_chans (int): Number of input image channels. Default: 3
        drop_path_rate (float): Maximum stochastic depth rate; per-block rates
            are spaced linearly from 0 to this value. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale.
            Default: 1e-6.
        return_idx (list[int]): Indices of the stages whose outputs are
            returned. Default: [1, 2, 3]
        norm_output (bool): Whether each returned feature map goes through its
            own channels-first LayerNorm. Default: True
        pretrained (str | None): URL (any string containing 'http') or local
            path of weights to load. Default: None
    """

    arch_settings = {
        'tiny': {
            'depths': [3, 3, 9, 3],
            'dims': [96, 192, 384, 768]
        },
        'small': {
            'depths': [3, 3, 27, 3],
            'dims': [96, 192, 384, 768]
        },
        'base': {
            'depths': [3, 3, 27, 3],
            'dims': [128, 256, 512, 1024]
        },
        'large': {
            'depths': [3, 3, 27, 3],
            'dims': [192, 384, 768, 1536]
        },
        'xlarge': {
            'depths': [3, 3, 27, 3],
            'dims': [256, 512, 1024, 2048]
        },
    }

    def __init__(
            self,
            arch='tiny',
            in_chans=3,
            drop_path_rate=0.,
            layer_scale_init_value=1e-6,
            return_idx=[1, 2, 3],
            norm_output=True,
            pretrained=None, ):
        super().__init__()
        setting = self.arch_settings[arch]
        depths = setting['depths']
        dims = setting['dims']

        # Stem followed by 3 intermediate downsampling layers.
        self.downsample_layers = nn.LayerList()
        stem_conv = nn.Conv2D(in_chans, dims[0], kernel_size=4, stride=4)
        stem_norm = LayerNorm(dims[0], eps=1e-6, data_format="channels_first")
        self.downsample_layers.append(nn.Sequential(stem_conv, stem_norm))
        for i in range(3):
            pre_norm = LayerNorm(
                dims[i], eps=1e-6, data_format="channels_first")
            reduce_conv = nn.Conv2D(
                dims[i], dims[i + 1], kernel_size=2, stride=2)
            self.downsample_layers.append(
                nn.Sequential(pre_norm, reduce_conv))

        # 4 feature-resolution stages, each a sequence of residual blocks.
        self.stages = nn.LayerList()
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        offset = 0
        for i in range(4):
            blocks = [
                Block(
                    dim=dims[i],
                    drop_path=dp_rates[offset + j],
                    layer_scale_init_value=layer_scale_init_value)
                for j in range(depths[i])
            ]
            self.stages.append(nn.Sequential(*blocks))
            offset += depths[i]

        self.return_idx = return_idx
        self.dims = [dims[i] for i in return_idx]
        self.norm_output = norm_output
        if norm_output:
            self.norms = nn.LayerList([
                LayerNorm(
                    ch, eps=1e-6, data_format="channels_first")
                for ch in self.dims
            ])

        self.apply(self._init_weights)

        if pretrained is not None:
            if 'http' in pretrained:
                # Remote weights: resolved to a local path by the helper.
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:
                # Weights already available at a local path.
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _init_weights(self, m):
        """Re-initialise conv / linear layers; other layers are untouched."""
        if not isinstance(m, (nn.Conv2D, nn.Linear)):
            return
        trunc_normal_(m.weight)
        zeros_(m.bias)

    def forward_features(self, x):
        """Run all four stages and return the ``return_idx`` feature maps."""
        stage_outs = []
        for i in range(4):
            x = self.stages[i](self.downsample_layers[i](x))
            stage_outs.append(x)

        picked = [stage_outs[i] for i in self.return_idx]
        if self.norm_output:
            picked = [self.norms[k](feat) for k, feat in enumerate(picked)]
        return picked

    def forward(self, x):
        """Extract features from ``x['image']``."""
        return self.forward_features(x['image'])

    @property
    def out_shape(self):
        """One ``ShapeSpec`` (channels only) per returned feature map."""
        return [ShapeSpec(channels=ch) for ch in self.dims]

View File

@@ -0,0 +1,404 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from ppdet.modeling.initializer import conv_init_
from ..shape_spec import ShapeSpec
__all__ = [
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
]
class BaseConv(nn.Layer):
    """Conv2D + BatchNorm2D followed by ``x * sigmoid(x)``.

    Args:
        in_channels (int): Input channel count.
        out_channels (int): Output channel count.
        ksize (int): Kernel size; padding is ``(ksize - 1) // 2``.
        stride (int): Convolution stride.
        groups (int): Convolution groups.
        bias: Passed to the convolution as ``bias_attr``.
        act (str): Accepted for interface compatibility; this class does not
            read it and always applies ``x * sigmoid(x)``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride,
                 groups=1,
                 bias=False,
                 act="silu"):
        super().__init__()
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias_attr=bias)
        # BatchNorm scale and shift are excluded from weight decay.
        self.bn = nn.BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        self._init_weights()

    def _init_weights(self):
        conv_init_(self.conv)

    def forward(self, x):
        # 'x * F.sigmoid(x)' is used in place of a 'silu' op.
        normed = self.bn(self.conv(x))
        return normed * F.sigmoid(normed)
class DWConv(nn.Layer):
    """Depthwise-separable conv: a depthwise ``BaseConv`` then a 1x1 ``BaseConv``.

    Args:
        in_channels (int): Input channel count (also the depthwise groups).
        out_channels (int): Output channel count of the pointwise conv.
        ksize (int): Kernel size of the depthwise conv.
        stride (int): Stride of the depthwise conv.
        bias: Forwarded to both ``BaseConv`` layers.
        act (str): Forwarded to both ``BaseConv`` layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 bias=False,
                 act="silu"):
        super().__init__()
        shared = dict(bias=bias, act=act)
        # Depthwise: one group per input channel.
        self.dw_conv = BaseConv(
            in_channels,
            in_channels,
            ksize=ksize,
            stride=stride,
            groups=in_channels,
            **shared)
        # Pointwise: 1x1 conv mixing channels.
        self.pw_conv = BaseConv(
            in_channels, out_channels, ksize=1, stride=1, groups=1, **shared)

    def forward(self, x):
        depthwise_out = self.dw_conv(x)
        return self.pw_conv(depthwise_out)
class Focus(nn.Layer):
    """Focus width and height information into channel space, used in YOLOX.

    Args:
        in_channels (int): Channels of the input; the conv sees 4x as many.
        out_channels (int): Output channel count.
        ksize (int): Kernel size of the conv.
        stride (int): Stride of the conv.
        bias: Forwarded to ``BaseConv``.
        act (str): Forwarded to ``BaseConv``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=3,
                 stride=1,
                 bias=False,
                 act="silu"):
        super().__init__()
        self.conv = BaseConv(
            in_channels * 4,
            out_channels,
            ksize=ksize,
            stride=stride,
            bias=bias,
            act=act)

    def forward(self, inputs):
        # inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2]:
        # every second row / column is sampled at the four pixel phases.
        even_rows = inputs[:, :, 0::2]
        odd_rows = inputs[:, :, 1::2]
        top_left = even_rows[:, :, :, 0::2]
        top_right = even_rows[:, :, :, 1::2]
        bottom_left = odd_rows[:, :, :, 0::2]
        bottom_right = odd_rows[:, :, :, 1::2]
        # Channel order is kept: TL, BL, TR, BR.
        stacked = paddle.concat(
            [top_left, bottom_left, top_right, bottom_right], 1)
        return self.conv(stacked)
class BottleNeck(nn.Layer):
    """1x1 conv then 3x3 conv, with an optional identity shortcut.

    Args:
        in_channels (int): Input channel count.
        out_channels (int): Output channel count.
        shortcut (bool): Request a residual connection; it is only added when
            ``in_channels == out_channels``.
        expansion (float): Hidden width as a fraction of ``out_channels``.
        depthwise (bool): Use ``DWConv`` instead of ``BaseConv`` for the 3x3.
        bias: Forwarded to the conv layers.
        act (str): Forwarded to the conv layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        conv3x3_cls = DWConv if depthwise else BaseConv
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = conv3x3_cls(
            hidden_channels,
            out_channels,
            ksize=3,
            stride=1,
            bias=bias,
            act=act)
        self.add_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        out = self.conv2(self.conv1(x))
        if not self.add_shortcut:
            return out
        return out + x
class SPPLayer(nn.Layer):
    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.

    Args:
        in_channels (int): Input channel count; the hidden width is half.
        out_channels (int): Output channel count.
        kernel_sizes (tuple[int]): One stride-1 max-pool per kernel size.
        bias: Forwarded to the conv layers.
        act (str): Forwarded to the conv layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 bias=False,
                 act="silu"):
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        pools = [
            nn.MaxPool2D(
                kernel_size=ks, stride=1, padding=ks // 2)
            for ks in kernel_sizes
        ]
        self.maxpoolings = nn.LayerList(pools)
        # The un-pooled branch plus one branch per pooling kernel.
        n_branches = len(kernel_sizes) + 1
        self.conv2 = BaseConv(
            hidden_channels * n_branches,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        reduced = self.conv1(x)
        branches = [reduced]
        branches.extend(pool(reduced) for pool in self.maxpoolings)
        return self.conv2(paddle.concat(branches, axis=1))
class SPPFLayer(nn.Layer):
    """Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn
    Jocher, equivalent to SPP(k=(5, 9, 13)).

    A single max-pool is applied three times in sequence and all intermediate
    results are concatenated with the un-pooled features.

    Args:
        in_channels (int): Input channel count; the hidden width is half.
        out_channels (int): Output channel count.
        ksize (int): Kernel size of the shared max-pool.
        bias: Forwarded to the conv layers.
        act (str): Forwarded to the conv layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=5,
                 bias=False,
                 act='silu'):
        super().__init__()
        hidden_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.maxpooling = nn.MaxPool2D(
            kernel_size=ksize, stride=1, padding=ksize // 2)
        # Four branches are concatenated: the input and three pooled maps.
        self.conv2 = BaseConv(
            hidden_channels * 4,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        feats = [self.conv1(x)]
        for _ in range(3):
            # Each pooling step consumes the previous step's output.
            feats.append(self.maxpooling(feats[-1]))
        return self.conv2(paddle.concat(feats, axis=1))
class CSPLayer(nn.Layer):
    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.

    Args:
        in_channels (int): Input channel count.
        out_channels (int): Output channel count.
        num_blocks (int): Number of stacked ``BottleNeck`` blocks.
        shortcut (bool): Forwarded to every ``BottleNeck``.
        expansion (float): Hidden width as a fraction of ``out_channels``.
        depthwise (bool): Forwarded to every ``BottleNeck``.
        bias: Forwarded to the conv layers.
        act (str): Forwarded to the conv layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=1,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        # Two parallel 1x1 projections of the same input.
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        blocks = [
            BottleNeck(
                hidden_channels,
                hidden_channels,
                shortcut=shortcut,
                expansion=1.0,
                depthwise=depthwise,
                bias=bias,
                act=act) for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*blocks)
        # Fuses the concatenation of both branches.
        self.conv3 = BaseConv(
            hidden_channels * 2,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        deep_branch = self.bottlenecks(self.conv1(x))
        skip_branch = self.conv2(x)
        merged = paddle.concat([deep_branch, skip_branch], axis=1)
        return self.conv3(merged)
@register
@serializable
class CSPDarkNet(nn.Layer):
    """
    CSPDarkNet backbone.

    Args:
        arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
            and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
        depth_mult (float): Depth multiplier, scales the number of blocks in
            each CSPLayer (at least one block is kept), default as 1.0.
        width_mult (float): Width multiplier, scales the number of channels
            in each layer, default as 1.0.
        depthwise (bool): Whether to use depth-wise conv layer.
        act (str): Activation function type, default as 'silu'.
        trt (bool): Accepted as a shared config value; not read in this class.
        return_idx (list): Index of stages whose feature maps are returned.
    """

    __shared__ = ['depth_mult', 'width_mult', 'act', 'trt']

    # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
    # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
    arch_settings = {
        'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
              [256, 512, 9, True, False], [512, 1024, 3, False, True]],
        'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 1024, 3, True, True]],
        'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
               [256, 512, 9, True, False], [512, 768, 3, True, False],
               [768, 1024, 3, True, True]],
    }

    def __init__(self,
                 arch='X',
                 depth_mult=1.0,
                 width_mult=1.0,
                 depthwise=False,
                 act='silu',
                 trt=False,
                 return_idx=[2, 3, 4]):
        super().__init__()
        self.arch = arch
        self.return_idx = return_idx
        Conv = DWConv if depthwise else BaseConv
        # NOTE(review): an unknown ``arch`` fails here with KeyError, so the
        # AttributeError branch further down is effectively unreachable.
        arch_setting = self.arch_settings[arch]
        base_channels = int(arch_setting[0][0] * width_mult)

        # Note: differences between the latest YOLOv5 and the original YOLOX
        # 1. the stem: plain Conv (YOLOv5) or Focus (YOLOX)
        # 2. use SPPF (in YOLOv5) or SPP (in YOLOX)
        # 3. SPPF goes after the last block's CSPLayer (YOLOv5), SPP goes
        #    before it (YOLOX)
        # 4. whether the CSPLayer next to SPPF/SPP adds a shortcut:
        #    True in YOLOv5, False in YOLOX
        if arch in ['P5', 'P6']:
            # latest YOLOv5: Conv stem and SPPF (single spp kernel size)
            self.stem = Conv(
                3, base_channels, ksize=6, stride=2, bias=False, act=act)
            spp_kernal_sizes = 5
        elif arch in ['X']:
            # original YOLOX: Focus stem and SPP (three spp kernel sizes)
            self.stem = Focus(
                3, base_channels, ksize=3, stride=1, bias=False, act=act)
            spp_kernal_sizes = (5, 9, 13)
        else:
            raise AttributeError("Unsupported arch type: {}".format(arch))

        is_yolox = arch in ['X']
        is_yolov5 = arch in ['P5', 'P6']

        _out_channels = [base_channels]
        layers_num = 1
        # Plain Python list: the individual layers are registered through
        # add_sublayer below; the Sequential wrappers only group them.
        self.csp_dark_blocks = []
        for i, (in_channels, out_channels, num_blocks, shortcut,
                use_spp) in enumerate(arch_setting):
            stage_no = i + 1
            in_channels = int(in_channels * width_mult)
            out_channels = int(out_channels * width_mult)
            _out_channels.append(out_channels)
            num_blocks = max(round(num_blocks * depth_mult), 1)
            stage = []

            # Stride-2 conv that opens every stage.
            conv_layer = self.add_sublayer(
                'layers{}.stage{}.conv_layer'.format(layers_num, stage_no),
                Conv(
                    in_channels, out_channels, 3, 2, bias=False, act=act))
            stage.append(conv_layer)
            layers_num += 1

            if use_spp and is_yolox:
                # in YOLOX use SPPLayer, placed before the CSPLayer
                spp_layer = self.add_sublayer(
                    'layers{}.stage{}.spp_layer'.format(layers_num, stage_no),
                    SPPLayer(
                        out_channels,
                        out_channels,
                        kernel_sizes=spp_kernal_sizes,
                        bias=False,
                        act=act))
                stage.append(spp_layer)
                layers_num += 1

            csp_layer = self.add_sublayer(
                'layers{}.stage{}.csp_layer'.format(layers_num, stage_no),
                CSPLayer(
                    out_channels,
                    out_channels,
                    num_blocks=num_blocks,
                    shortcut=shortcut,
                    depthwise=depthwise,
                    bias=False,
                    act=act))
            stage.append(csp_layer)
            layers_num += 1

            if use_spp and is_yolov5:
                # in latest YOLOv5 use SPPFLayer instead of SPPLayer,
                # placed after the CSPLayer
                sppf_layer = self.add_sublayer(
                    'layers{}.stage{}.sppf_layer'.format(layers_num, stage_no),
                    SPPFLayer(
                        out_channels,
                        out_channels,
                        ksize=5,
                        bias=False,
                        act=act))
                stage.append(sppf_layer)
                layers_num += 1

            self.csp_dark_blocks.append(nn.Sequential(*stage))

        # Index 0 refers to the stem, index k to the k-th stage.
        stride_table = [2, 4, 8, 16, 32, 64]
        self._out_channels = [_out_channels[i] for i in self.return_idx]
        self.strides = [stride_table[i] for i in self.return_idx]

    def forward(self, inputs):
        """Return the feature maps of the stages listed in ``return_idx``."""
        feat = self.stem(inputs['image'])
        outputs = []
        # Stage numbering starts at 1 because index 0 is the stem.
        for stage_no, block in enumerate(self.csp_dark_blocks, start=1):
            feat = block(feat)
            if stage_no in self.return_idx:
                outputs.append(feat)
        return outputs

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels and stride) per returned feature map."""
        return [
            ShapeSpec(
                channels=ch, stride=st)
            for ch, st in zip(self._out_channels, self.strides)
        ]

View File

@@ -0,0 +1,321 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D + BatchNorm2D + activation.

    Args:
        ch_in (int): Input channel count.
        ch_out (int): Output channel count.
        filter_size (int): Convolution kernel size.
        stride (int): Convolution stride.
        groups (int): Convolution groups.
        padding (int): Convolution padding.
        act: ``None``, a ``str`` or a ``dict`` is resolved through
            ``get_act_fn``; anything else is used as the activation as is.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act=None):
        super().__init__()
        self.conv = nn.Conv2D(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)
        # BatchNorm scale and shift are excluded from weight decay.
        no_decay = L2Decay(0.0)
        self.bn = nn.BatchNorm2D(
            ch_out,
            weight_attr=ParamAttr(regularizer=no_decay),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act)
        self.act = act

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
class RepVggBlock(nn.Layer):
    """RepVGG-style block: parallel 3x3 and 1x1 conv-BN branches.

    During training the two branch outputs are summed (the 1x1 branch is
    optionally scaled by a learnable ``alpha``) and passed through the
    activation. ``convert_to_deploy`` folds both branches into a single 3x3
    convolution stored as ``self.conv``.

    Args:
        ch_in (int): Input channel count.
        ch_out (int): Output channel count.
        act: ``None``, a ``str`` or a ``dict`` is resolved through
            ``get_act_fn``; anything else is used as the activation as is.
        alpha (bool): Create a learnable scalar that scales the 1x1 branch.
    """

    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
        super(RepVggBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=None)
        self.conv2 = ConvBNLayer(
            ch_in, ch_out, 1, stride=1, padding=0, act=None)
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act
        if alpha:
            self.alpha = self.create_parameter(
                shape=[1],
                attr=ParamAttr(initializer=Constant(value=1.)),
                dtype="float32")
        else:
            self.alpha = None

    def forward(self, x):
        if hasattr(self, 'conv'):
            # Deploy mode: the branches have been fused into one conv.
            y = self.conv(x)
        elif self.alpha is not None:
            # Test for existence, not truthiness: ``alpha`` is a learnable
            # tensor, and a learned value of exactly 0 must still scale the
            # 1x1 branch instead of silently selecting the unscaled sum.
            y = self.conv1(x) + self.alpha * self.conv2(x)
        else:
            y = self.conv1(x) + self.conv2(x)
        y = self.act(y)
        return y

    def convert_to_deploy(self):
        """Fuse both branches into ``self.conv`` and drop the branches.

        Calling this again after a successful conversion is a no-op.
        """
        if not hasattr(self, 'conv1'):
            # Branches were already fused and removed by an earlier call.
            return
        if not hasattr(self, 'conv'):
            self.conv = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1)
        kernel, bias = self.get_equivalent_kernel_bias()
        self.conv.weight.set_value(kernel)
        self.conv.bias.set_value(bias)
        self.__delattr__('conv1')
        self.__delattr__('conv2')

    def get_equivalent_kernel_bias(self):
        """Return the ``(kernel, bias)`` of the equivalent single 3x3 conv."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        padded1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
        if self.alpha is not None:
            # Same existence check as in ``forward`` so that the fused conv
            # matches the training-time computation for every alpha value.
            return (kernel3x3 + self.alpha * padded1x1,
                    bias3x3 + self.alpha * bias1x1)
        return kernel3x3 + padded1x1, bias3x3 + bias1x1

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel by one element on each spatial side."""
        if kernel1x1 is None:
            return 0
        else:
            return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's BatchNorm statistics into its conv weights."""
        if branch is None:
            return 0, 0
        kernel = branch.conv.weight
        running_mean = branch.bn._mean
        running_var = branch.bn._variance
        gamma = branch.bn.weight
        beta = branch.bn.bias
        eps = branch.bn._epsilon
        std = (running_var + eps).sqrt()
        # One scale factor per output channel, broadcast over the kernel.
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std
class BasicBlock(nn.Layer):
    """3x3 ``ConvBNLayer`` followed by a ``RepVggBlock``, optionally residual.

    Args:
        ch_in (int): Input channel count; must equal ``ch_out``.
        ch_out (int): Output channel count.
        act: Activation forwarded to both sub-layers.
        shortcut (bool): Add the input to the output.
        use_alpha (bool): Forwarded to ``RepVggBlock`` as ``alpha``.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 act='relu',
                 shortcut=True,
                 use_alpha=False):
        super().__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act)
        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
        self.shortcut = shortcut

    def forward(self, x):
        out = self.conv2(self.conv1(x))
        if not self.shortcut:
            return out
        return paddle.add(x, out)
class EffectiveSELayer(nn.Layer):
    """Effective Squeeze-Excitation.

    From `CenterMask : Real-Time Anchor-Free Instance Segmentation` -
    https://arxiv.org/abs/1911.06667

    The input is averaged over axes 2 and 3, passed through a 1x1 conv and
    the activation, and the result rescales the input.

    Args:
        channels (int): Channel count of the input and of the gate.
        act: ``None``, a ``str`` or a ``dict`` is resolved through
            ``get_act_fn``; anything else is used as the activation as is.
    """

    def __init__(self, channels, act='hardsigmoid'):
        super().__init__()
        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act)
        self.act = act

    def forward(self, x):
        pooled = x.mean((2, 3), keepdim=True)
        gate = self.act(self.fc(pooled))
        return x * gate
class CSPResStage(nn.Layer):
    """One CSP stage: optional stride-2 conv, two 1x1 branches, fusion conv.

    Args:
        block_fn: Callable building one block; called as
            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)``.
        ch_in (int): Input channel count.
        ch_out (int): Output channel count.
        n (int): Number of blocks on the deep branch.
        stride (int): A value of 2 adds the downsampling conv; any other
            value skips it.
        act: Activation forwarded to the conv layers and blocks.
        attn: Any truthy value enables an ``EffectiveSELayer`` on the
            concatenated branches.
        use_alpha (bool): Forwarded to ``block_fn``.
    """

    def __init__(self,
                 block_fn,
                 ch_in,
                 ch_out,
                 n,
                 stride,
                 act='relu',
                 attn='eca',
                 use_alpha=False):
        super().__init__()
        ch_mid = (ch_in + ch_out) // 2
        half = ch_mid // 2
        if stride == 2:
            self.conv_down = ConvBNLayer(
                ch_in, ch_mid, 3, stride=2, padding=1, act=act)
        else:
            self.conv_down = None
        self.conv1 = ConvBNLayer(ch_mid, half, 1, act=act)
        self.conv2 = ConvBNLayer(ch_mid, half, 1, act=act)
        block_list = [
            block_fn(
                half, half, act=act, shortcut=True, use_alpha=use_alpha)
            for _ in range(n)
        ]
        self.blocks = nn.Sequential(*block_list)
        if attn:
            self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid')
        else:
            self.attn = None
        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)

    def forward(self, x):
        if self.conv_down is not None:
            x = self.conv_down(x)
        shallow = self.conv1(x)
        deep = self.blocks(self.conv2(x))
        fused = paddle.concat([shallow, deep], axis=1)
        if self.attn is not None:
            fused = self.attn(fused)
        return self.conv3(fused)
@register
@serializable
class CSPResNet(nn.Layer):
    """CSPResNet backbone: a conv stem followed by stride-2 ``CSPResStage``s.

    Args:
        layers (list[int]): Blocks per stage, scaled by ``depth_mult``.
        channels (list[int]): Stem width followed by one width per stage,
            scaled by ``width_mult``.
        act: Activation; ``None``, ``str`` or ``dict`` values are resolved
            through ``get_act_fn`` (with ``trt``).
        return_idx (list[int]): Indices of the stages whose outputs are
            returned.
        depth_wise (bool): Accepted; not read in this class.
        use_large_stem (bool): Use a three-conv stem instead of two convs.
        width_mult (float): Channel multiplier.
        depth_mult (float): Block-count multiplier.
        trt (bool): Forwarded to ``get_act_fn``.
        use_checkpoint (bool): Recompute stage activations while training;
            also seeds Paddle's RNG with 0 at construction.
        use_alpha (bool): Forwarded to every stage.
        **args: Extra keyword arguments are accepted and ignored.
    """

    __shared__ = ['width_mult', 'depth_mult', 'trt']

    def __init__(self,
                 layers=[3, 6, 6, 3],
                 channels=[64, 128, 256, 512, 1024],
                 act='swish',
                 return_idx=[1, 2, 3],
                 depth_wise=False,
                 use_large_stem=False,
                 width_mult=1.0,
                 depth_mult=1.0,
                 trt=False,
                 use_checkpoint=False,
                 use_alpha=False,
                 **args):
        super().__init__()
        self.use_checkpoint = use_checkpoint
        # Scale widths and depths; neither may drop below 1.
        channels = [max(round(c * width_mult), 1) for c in channels]
        layers = [max(round(l * depth_mult), 1) for l in layers]
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        stem_half = channels[0] // 2
        stem_layers = [('conv1', ConvBNLayer(
            3, stem_half, 3, stride=2, padding=1, act=act))]
        if use_large_stem:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_half, stem_half, 3, stride=1, padding=1, act=act)))
            stem_layers.append(('conv3', ConvBNLayer(
                stem_half, channels[0], 3, stride=1, padding=1, act=act)))
        else:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_half, channels[0], 3, stride=1, padding=1, act=act)))
        self.stem = nn.Sequential(*stem_layers)

        n = len(channels) - 1
        named_stages = []
        for i in range(n):
            stage = CSPResStage(
                BasicBlock,
                channels[i],
                channels[i + 1],
                layers[i],
                2,
                act=act,
                use_alpha=use_alpha)
            named_stages.append((str(i), stage))
        self.stages = nn.Sequential(*named_stages)

        self._out_channels = channels[1:]
        self._out_strides = [4 * 2**i for i in range(n)]
        self.return_idx = return_idx
        if use_checkpoint:
            paddle.seed(0)

    def forward(self, inputs):
        """Return the outputs of the stages listed in ``return_idx``."""
        x = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            recompute_stage = self.use_checkpoint and self.training
            if recompute_stage:
                x = paddle.distributed.fleet.utils.recompute(
                    stage, x, preserve_rng_state=True)
            else:
                x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels and stride) per returned stage."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

View File

@@ -0,0 +1,345 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.ops import batch_norm, mish
from ..shape_spec import ShapeSpec
__all__ = ['DarkNet', 'ConvBNLayer']
class ConvBNLayer(nn.Layer):
    """Bias-free convolution, then normalization, then an activation."""

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 norm_type='bn',
                 norm_decay=0.,
                 act="leaky",
                 freeze_norm=False,
                 data_format='NCHW',
                 name=''):
        """
        Build the conv + norm pair and remember the activation name.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            filter_size (int): filter size, default 3
            stride (int): stride, default 1
            groups (int): number of groups of conv layer, default 1
            padding (int): padding size, default 0
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            act (str): activation function type, default 'leaky', which means
                leaky_relu; any other value is looked up by name on
                ``paddle.nn.functional``
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
            name (str): accepted but not used by this layer
        """
        super().__init__()
        conv_kwargs = dict(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            data_format=data_format,
            bias_attr=False)
        self.conv = nn.Conv2D(**conv_kwargs)
        self.batch_norm = batch_norm(
            ch_out,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.act = act

    def forward(self, inputs):
        """Apply conv, norm and the configured activation to ``inputs``."""
        feat = self.batch_norm(self.conv(inputs))
        if self.act == 'leaky':
            # 'leaky' is the only name handled explicitly (negative slope 0.1).
            return F.leaky_relu(feat, 0.1)
        activation = getattr(F, self.act)
        return activation(feat)
class DownSample(nn.Layer):
    """Thin wrapper around one ``ConvBNLayer`` (defaults: 3x3, stride 2)."""

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=2,
                 padding=1,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Create the wrapped convolution layer.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            filter_size (int): filter size, default 3
            stride (int): stride, default 2
            padding (int): padding size, default 1
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super().__init__()
        layer_kwargs = dict(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv_bn_layer = ConvBNLayer(**layer_kwargs)
        self.ch_out = ch_out

    def forward(self, inputs):
        """Return the wrapped ``ConvBNLayer`` applied to ``inputs``."""
        return self.conv_bn_layer(inputs)
class BasicBlock(nn.Layer):
    """DarkNet residual unit: 1x1 squeeze, 3x3 expand, identity add."""

    def __init__(self,
                 ch_in,
                 ch_out,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Build the two convolutions of the residual branch.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel; must equal ``ch_in`` and be even
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super().__init__()

        assert ch_in == ch_out and (ch_in % 2) == 0, \
            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"
        # Channel route, e.g. for 10 channels:
        #   conv1: 10 --> 5, conv2: 5 --> 10
        ch_half = int(ch_out / 2)
        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv1 = ConvBNLayer(
            ch_in=ch_in,
            ch_out=ch_half,
            filter_size=1,
            stride=1,
            padding=0,
            **norm_kwargs)
        self.conv2 = ConvBNLayer(
            ch_in=ch_half,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            padding=1,
            **norm_kwargs)

    def forward(self, inputs):
        """Return ``inputs`` plus the output of the two-conv branch."""
        residual = self.conv2(self.conv1(inputs))
        return paddle.add(x=inputs, y=residual)
class Blocks(nn.Layer):
    """A chain of ``count`` DarkNet ``BasicBlock`` layers."""

    def __init__(self,
                 ch_in,
                 ch_out,
                 count,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None,
                 data_format='NCHW'):
        """
        Create the first block as ``basicblock0`` and register the rest
        as sub-layers named ``'{name}.{i}'`` for ``i`` in ``1..count-1``.

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            count (int): number of BasicBlock layer
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            freeze_norm (bool): whether to freeze norm, default False
            name (str): layer name, used as the prefix of sub-layer names
            data_format (str): data format, NCHW or NHWC
        """
        super().__init__()
        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        self.basicblock0 = BasicBlock(ch_in, ch_out, **norm_kwargs)
        self.res_out_list = [
            self.add_sublayer('{}.{}'.format(name, i),
                              BasicBlock(ch_out, ch_out, **norm_kwargs))
            for i in range(1, count)
        ]
        self.ch_out = ch_out

    def forward(self, inputs):
        """Run ``basicblock0`` and then every block in ``res_out_list``."""
        feat = self.basicblock0(inputs)
        for block in self.res_out_list:
            feat = block(feat)
        return feat
# Maps a supported network depth to the number of BasicBlock layers per stage.
DarkNet_cfg = {53: [1, 2, 8, 8, 4]}
@register
@serializable
class DarkNet(nn.Layer):
    """DarkNet backbone built from ``Blocks`` stages and ``DownSample`` layers."""
    __shared__ = ['norm_type', 'data_format']

    def __init__(self,
                 depth=53,
                 freeze_at=-1,
                 return_idx=[2, 3, 4],
                 num_stages=5,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Darknet, see https://pjreddie.com/darknet/yolo/

        Args:
            depth (int): depth of network; must be a key of ``DarkNet_cfg``
            freeze_at (int): index of the stage whose output gets
                ``stop_gradient = True``
            return_idx (list): index of stages whose feature maps are returned
            num_stages (int): how many entries of the depth config are used
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super().__init__()
        self.depth = depth
        self.freeze_at = freeze_at
        self.return_idx = return_idx
        self.num_stages = num_stages
        self.stages = DarkNet_cfg[self.depth][0:num_stages]

        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        self.conv0 = ConvBNLayer(
            ch_in=3,
            ch_out=32,
            filter_size=3,
            stride=1,
            padding=1,
            **norm_kwargs)
        self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, **norm_kwargs)

        self._out_channels = []
        self.darknet_conv_block_list = []
        self.downsample_list = []
        ch_in = [64, 128, 256, 512, 1024]

        # Stage blocks are registered first, as 'stage.{i}'.
        for i, stage in enumerate(self.stages):
            name = 'stage.{}'.format(i)
            width = int(ch_in[i])
            stage_layer = Blocks(
                width, width, stage, name=name, **norm_kwargs)
            self.darknet_conv_block_list.append(
                self.add_sublayer(name, stage_layer))
            if i in return_idx:
                self._out_channels.append(width)

        # Then the down-sampling layers that sit between consecutive stages.
        for i in range(num_stages - 1):
            down_name = 'stage.{}.downsample'.format(i)
            down_layer = DownSample(
                ch_in=int(ch_in[i]), ch_out=int(ch_in[i + 1]), **norm_kwargs)
            self.downsample_list.append(
                self.add_sublayer(down_name, down_layer))

    def forward(self, inputs):
        """Return the outputs of the stages listed in ``return_idx``.

        ``inputs`` is indexed with the key ``'image'``.
        """
        feat = self.downsample0(self.conv0(inputs['image']))
        collected = []
        for i, stage_layer in enumerate(self.darknet_conv_block_list):
            feat = stage_layer(feat)
            if i == self.freeze_at:
                feat.stop_gradient = True
            if i in self.return_idx:
                collected.append(feat)
            if i < self.num_stages - 1:
                feat = self.downsample_list[i](feat)
        return collected

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per collected stage, carrying only its channels."""
        specs = []
        for channels in self._out_channels:
            specs.append(ShapeSpec(channels=channels))
        return specs

View File

@@ -0,0 +1,720 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
from .swin_transformer import Mlp
__all__ = ['FocalNet']
def _focalnet_cfg(embed_dim,
                  stage3_depth,
                  focal_level,
                  focal_window,
                  drop_path_rate,
                  pretrained,
                  large_recipe=False,
                  use_postln_in_modulation=False,
                  normalize_modulator=False):
    """Build one FocalNet preset dict.

    Every preset uses depths ``[2, 2, stage3_depth, 2]`` and repeats the
    focal level / window for all four stages. ``large_recipe`` switches
    ``use_conv_embed``, ``use_postln`` and ``use_layerscale`` together,
    because the presets below always toggle those three as a group.
    """
    return dict(
        embed_dim=embed_dim,
        depths=[2, 2, stage3_depth, 2],
        focal_levels=[focal_level] * 4,
        focal_windows=[focal_window] * 4,
        drop_path_rate=drop_path_rate,
        use_conv_embed=large_recipe,
        use_postln=large_recipe,
        use_postln_in_modulation=use_postln_in_modulation,
        use_layerscale=large_recipe,
        normalize_modulator=normalize_modulator,
        pretrained=pretrained)


# Preset name -> constructor overrides and pretrained-weights URL.
MODEL_cfg = {
    'focalnet_T_224_1k_srf': _focalnet_cfg(
        96, 6, 2, 3, 0.2,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams'),
    'focalnet_S_224_1k_srf': _focalnet_cfg(
        96, 18, 2, 3, 0.3,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams'),
    'focalnet_B_224_1k_srf': _focalnet_cfg(
        128, 18, 2, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams'),
    'focalnet_T_224_1k_lrf': _focalnet_cfg(
        96, 6, 3, 3, 0.2,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams'),
    'focalnet_S_224_1k_lrf': _focalnet_cfg(
        96, 18, 3, 3, 0.3,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams'),
    'focalnet_B_224_1k_lrf': _focalnet_cfg(
        128, 18, 3, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams'),
    'focalnet_L_384_22k_fl3': _focalnet_cfg(
        192, 18, 3, 5, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
        large_recipe=True),
    'focalnet_L_384_22k_fl4': _focalnet_cfg(
        192, 18, 4, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
        large_recipe=True,
        normalize_modulator=True),
    'focalnet_XL_384_22k_fl3': _focalnet_cfg(
        256, 18, 3, 5, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
        large_recipe=True),
    'focalnet_XL_384_22k_fl4': _focalnet_cfg(
        256, 18, 4, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
        large_recipe=True),
    'focalnet_H_224_22k_fl3': _focalnet_cfg(
        352, 18, 3, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
        large_recipe=True,
        use_postln_in_modulation=True),
    'focalnet_H_224_22k_fl4': _focalnet_cfg(
        352, 18, 4, 3, 0.5,
        'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
        large_recipe=True,
        use_postln_in_modulation=True),
}
class FocalModulation(nn.Layer):
    """Focal modulation: gated multi-level depth-wise context times a query.

    Args:
        dim (int): Number of input channels.
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        focal_factor (int): Step to increase the focal window. Default: 2
        use_postln_in_modulation (bool): Whether use post-modulation layernorm
        normalize_modulator (bool): Whether use normalize in modulator
    """

    def __init__(self,
                 dim,
                 proj_drop=0.,
                 focal_level=2,
                 focal_window=7,
                 focal_factor=2,
                 use_postln_in_modulation=False,
                 normalize_modulator=False):
        super().__init__()
        self.dim = dim

        # specific args for focalv3
        self.focal_level = focal_level
        self.focal_window = focal_window
        self.focal_factor = focal_factor
        self.use_postln_in_modulation = use_postln_in_modulation
        self.normalize_modulator = normalize_modulator

        # One projection yields query, context and (focal_level + 1) gates.
        num_gates = self.focal_level + 1
        self.f = nn.Linear(dim, 2 * dim + num_gates, bias_attr=True)
        self.h = nn.Conv2D(
            dim,
            dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            bias_attr=True)

        self.act = nn.GELU()
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.focal_layers = nn.LayerList()

        if self.use_postln_in_modulation:
            self.ln = nn.LayerNorm(dim)

        # Level k uses a depth-wise kernel of focal_factor * k + focal_window.
        for level in range(self.focal_level):
            kernel_size = self.focal_factor * level + self.focal_window
            depthwise = nn.Conv2D(
                dim,
                dim,
                kernel_size=kernel_size,
                stride=1,
                groups=dim,
                padding=kernel_size // 2,
                bias_attr=False)
            self.focal_layers.append(nn.Sequential(depthwise, nn.GELU()))

    def forward(self, x):
        """ Forward function.
        Args:
            x: input features with shape of (B, H, W, C)
        """
        channels = x.shape[3]
        projected = self.f(x).transpose([0, 3, 1, 2])
        q, ctx, gates = paddle.split(
            projected, (channels, channels, self.focal_level + 1), 1)

        # Accumulate each level's context, weighted by that level's gate.
        ctx_all = 0
        for level, focal_layer in enumerate(self.focal_layers):
            ctx = focal_layer(ctx)
            ctx_all = ctx_all + ctx * gates[:, level:level + 1]

        # The last gate weights the mean of the deepest context over axes 2, 3.
        pooled = ctx.mean(2, keepdim=True).mean(3, keepdim=True)
        ctx_global = self.act(pooled)
        ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:]
        if self.normalize_modulator:
            ctx_all = ctx_all / (self.focal_level + 1)

        modulated = q * self.h(ctx_all)
        modulated = modulated.transpose([0, 2, 3, 1])
        if self.use_postln_in_modulation:
            modulated = self.ln(modulated)
        return self.proj_drop(self.proj(modulated))
class FocalModulationBlock(nn.Layer):
    """ Focal Modulation Block: focal modulation then an MLP, both residual.

    ``H`` and ``W`` must be set on the instance before calling ``forward``.

    Args:
        dim (int): Number of input channels.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        focal_level (int): number of focal levels
        focal_window (int): focal kernel size at level 1
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value for layer scale. Default: 1e-4
    """

    def __init__(self,
                 dim,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 focal_level=2,
                 focal_window=9,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_layerscale=False,
                 layerscale_value=1e-4):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.focal_window = focal_window
        self.focal_level = focal_level
        self.use_postln = use_postln
        self.use_layerscale = use_layerscale

        self.norm1 = norm_layer(dim)
        self.modulation = FocalModulation(
            dim,
            proj_drop=drop,
            focal_level=self.focal_level,
            focal_window=self.focal_window,
            use_postln_in_modulation=use_postln_in_modulation,
            normalize_modulator=normalize_modulator)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.H = None
        self.W = None

        # Plain 1.0 scales unless layer scale is enabled.
        self.gamma_1 = 1.0
        self.gamma_2 = 1.0
        if self.use_layerscale:
            self.gamma_1 = add_parameter(self,
                                         layerscale_value * paddle.ones([dim]))
            self.gamma_2 = add_parameter(self,
                                         layerscale_value * paddle.ones([dim]))

    def forward(self, x):
        """
        Args:
            x: Input feature, tensor size (B, H*W, C).
        """
        _, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        post_norm = self.use_postln
        shortcut = x
        if not post_norm:
            x = self.norm1(x)

        # FM
        x = self.modulation(x.reshape([-1, H, W, C]))
        x = x.reshape([-1, H * W, C])
        if post_norm:
            x = self.norm1(x)

        # FFN
        x = shortcut + self.drop_path(self.gamma_1 * x)
        if post_norm:
            ffn_out = self.norm2(self.mlp(x))
        else:
            ffn_out = self.mlp(self.norm2(x))
        return x + self.drop_path(self.gamma_2 * ffn_out)
class BasicLayer(nn.Layer):
    """ A basic focal modulation layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float] | np.ndarray, optional):
            Stochastic depth rate. A sequence is indexed once per block;
            a scalar is shared by every block. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_checkpoint (bool): Stored on the instance; ``forward`` in this
            class does not read it. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 focal_level=2,
                 focal_window=9,
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_checkpoint=False):
        super().__init__()
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Any per-block sequence is indexed, not just np.ndarray: a list or
        # tuple passed through whole would fail the `drop_path > 0.` check
        # inside FocalModulationBlock.
        per_block_rates = isinstance(drop_path, (list, tuple, np.ndarray))

        # build blocks
        self.blocks = nn.LayerList([
            FocalModulationBlock(
                dim=dim,
                mlp_ratio=mlp_ratio,
                drop=drop,
                drop_path=drop_path[i] if per_block_rates else drop_path,
                act_layer=nn.GELU,
                norm_layer=norm_layer,
                focal_level=focal_level,
                focal_window=focal_window,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(
                patch_size=2,
                in_chans=dim,
                embed_dim=2 * dim,
                use_conv_embed=use_conv_embed,
                norm_layer=norm_layer,
                is_stem=False)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """
        Run every block, then the optional downsample layer.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H: height assigned to each block before it is called.
            W: width assigned to each block before it is called.

        Returns:
            ``(x, H, W, x_down, Wh, Ww)``: the stage output with its size,
            followed by the downsampled tokens with ``Wh = (H + 1) // 2`` and
            ``Ww = (W + 1) // 2``. Without a downsample layer the first
            three values are repeated.
        """
        for blk in self.blocks:
            # Blocks read their spatial size from these attributes.
            blk.H, blk.W = H, W
            x = blk(x)
        if self.downsample is not None:
            x_reshaped = x.transpose([0, 2, 1]).reshape(
                [x.shape[0], x.shape[-1], H, W])
            x_down = self.downsample(x_reshaped)
            x_down = x_down.flatten(2).transpose([0, 2, 1])
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False
        is_stem (bool): Is the stem block or not.
    """

    def __init__(self,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 norm_layer=None,
                 use_conv_embed=False,
                 is_stem=False):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        if use_conv_embed:
            # if we choose to use conv embedding, then we treat the stem and non-stem differently
            if is_stem:
                kernel_size = 7
                padding = 2
                stride = 4
            else:
                kernel_size = 3
                padding = 1
                stride = 2
            self.proj = nn.Conv2D(
                in_chans,
                embed_dim,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding)
        else:
            self.proj = nn.Conv2D(
                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Pad ``x`` to a multiple of the patch size, project, then normalize.

        The input is unpacked as a 4-D tensor whose last two axes are H and W.
        The output size is read back from the projected tensor, so no padded
        H/W bookkeeping is kept here.
        """
        _, _, H, W = x.shape
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        if W % patch_w != 0:
            # for 3D tensor: [pad_left, pad_right]
            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
        if H % patch_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])

        x = self.proj(x)
        if self.norm is not None:
            _, _, Wh, Ww = x.shape
            # The norm runs with channels last, then the layout is restored.
            x = x.flatten(2).transpose([0, 2, 1])
            x = self.norm(x)
            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])

        return x
@register
@serializable
class FocalNet(nn.Layer):
    """ FocalNet backbone

    The values of ``embed_dim``, ``depths``, ``drop_path_rate``,
    ``focal_levels``, ``focal_windows``, ``use_conv_embed``,
    ``use_layerscale``, ``use_postln``, ``use_postln_in_modulation`` and
    ``normalize_modulator`` are always taken from ``MODEL_cfg[arch]``; values
    passed for those arguments are overwritten.

    Args:
        arch (str): Architecture of FocalNet; must be a key of ``MODEL_cfg``.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each FocalNet Transformer stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        drop_rate (float): Dropout rate.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        focal_levels (Sequence[int]): Number of focal levels at four stages
        focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages
        use_conv_embed (bool): Whether use overlapped convolution for patch embedding
        use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False
        layerscale_value (float): Value of layerscale
        use_postln (bool): Whether use layernorm after modulation. Default: False.
        use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False.
        normalize_modulator (bool): Whether use normalize in modulator
        use_checkpoint (bool): Forwarded to every ``BasicLayer``. Default: False.
        pretrained (str | None): URL or local path of weights to load.
            ``None`` selects the URL stored in ``MODEL_cfg[arch]``; any other
            falsy value (e.g. ``''``) skips loading.
    """

    def __init__(
            self,
            arch='focalnet_T_224_1k_srf',
            out_indices=(0, 1, 2, 3),
            frozen_stages=-1,
            patch_size=4,
            in_chans=3,
            embed_dim=96,
            depths=[2, 2, 6, 2],
            mlp_ratio=4.,
            drop_rate=0.,
            drop_path_rate=0.2,  # 0.5 better for large+ models
            norm_layer=nn.LayerNorm,
            patch_norm=True,
            focal_levels=[2, 2, 2, 2],
            focal_windows=[3, 3, 3, 3],
            use_conv_embed=False,
            use_layerscale=False,
            layerscale_value=1e-4,
            use_postln=False,
            use_postln_in_modulation=False,
            normalize_modulator=False,
            use_checkpoint=False,
            pretrained=None):
        super(FocalNet, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The arch preset overrides the matching constructor arguments.
        embed_dim = MODEL_cfg[arch]['embed_dim']
        depths = MODEL_cfg[arch]['depths']
        drop_path_rate = MODEL_cfg[arch]['drop_path_rate']
        focal_levels = MODEL_cfg[arch]['focal_levels']
        focal_windows = MODEL_cfg[arch]['focal_windows']
        use_conv_embed = MODEL_cfg[arch]['use_conv_embed']
        use_layerscale = MODEL_cfg[arch]['use_layerscale']
        use_postln = MODEL_cfg[arch]['use_postln']
        use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation']
        normalize_modulator = MODEL_cfg[arch]['normalize_modulator']
        if pretrained is None:
            pretrained = MODEL_cfg[arch]['pretrained']

        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.num_layers = len(depths)
        self.patch_norm = patch_norm

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None,
            use_conv_embed=use_conv_embed,
            is_stem=True)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth decay rule
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # build layers
        self.layers = nn.LayerList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                depth=depths[i_layer],
                mlp_ratio=mlp_ratio,
                drop=drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchEmbed
                if (i_layer < self.num_layers - 1) else None,
                focal_level=focal_levels[i_layer],
                focal_window=focal_windows[i_layer],
                use_conv_embed=use_conv_embed,
                use_layerscale=use_layerscale,
                layerscale_value=layerscale_value,
                use_postln=use_postln,
                use_postln_in_modulation=use_postln_in_modulation,
                normalize_modulator=normalize_modulator,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_sublayer(layer_name, layer)

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            # Only a real scheme prefix marks a remote checkpoint, so a local
            # path that merely contains "http" is still loaded from disk.
            if pretrained.startswith(('http://', 'https://')):  # URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Put frozen parts in eval mode and stop gradients on their params.

        ``frozen_stages >= 0`` freezes the patch embedding;
        ``frozen_stages >= 2`` also puts ``pos_drop`` in eval mode and
        freezes ``layers[0 .. frozen_stages - 2]``.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initializer passed to ``self.apply`` for Linear/LayerNorm layers."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Return one normalized feature map per stage in ``out_indices``.

        ``x`` is indexed with the key ``'image'``.
        """
        x = self.patch_embed(x['image'])
        B, _, Wh, Ww = x.shape
        x = x.flatten(2).transpose([0, 2, 1])
        x = self.pos_drop(x)
        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out/H/W describe this stage's output; x/Wh/Ww feed the next.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose(
                    (0, 3, 1, 2))
                outs.append(out)

        return outs

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per output stage; strides come from a fixed
        four-entry table ``[4, 8, 16, 32]``."""
        out_strides = [4, 8, 16, 32]
        return [
            ShapeSpec(
                channels=self.num_features[i], stride=out_strides[i])
            for i in self.out_indices
        ]

View File

@@ -0,0 +1,447 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import KaimingNormal, Constant
from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
from paddle.regularizer import L2Decay
from paddle import ParamAttr
import copy
from ppdet.core.workspace import register, serializable
from ..shape_spec import ShapeSpec
__all__ = ['PPHGNetV2']
# Module-level initializer instances: Kaiming-normal, constant 0 and constant 1.
kaiming_normal_ = KaimingNormal()
zeros_ = Constant(value=0.)
ones_ = Constant(value=1.)
class LearnableAffineBlock(nn.Layer):
    """Learnable scalar affine map: ``scale * x + bias``.

    Args:
        scale_value: initial value of the one-element ``scale`` parameter.
        bias_value: initial value of the one-element ``bias`` parameter.
        lr_mult: multiplied with ``lab_lr`` to give both parameters'
            ``learning_rate`` attribute.
        lab_lr: see ``lr_mult``.
    """

    def __init__(self,
                 scale_value=1.0,
                 bias_value=0.0,
                 lr_mult=1.0,
                 lab_lr=0.01):
        super().__init__()
        param_lr = lr_mult * lab_lr

        self.scale = self.create_parameter(
            shape=[1],
            default_initializer=Constant(value=scale_value),
            attr=ParamAttr(learning_rate=param_lr))
        self.add_parameter("scale", self.scale)

        self.bias = self.create_parameter(
            shape=[1],
            default_initializer=Constant(value=bias_value),
            attr=ParamAttr(learning_rate=param_lr))
        self.add_parameter("bias", self.bias)

    def forward(self, x):
        """Return ``x`` multiplied by ``scale`` and shifted by ``bias``."""
        scaled = self.scale * x
        return scaled + self.bias
class ConvBNAct(nn.Layer):
    """Bias-free Conv2D + BatchNorm2D, optionally followed by ReLU and a
    ``LearnableAffineBlock``.

    Args:
        in_channels: convolution input channels.
        out_channels: convolution output channels.
        kernel_size: convolution kernel size.
        stride: convolution stride.
        padding: used only when it is a ``str``; any other value is replaced
            by ``(kernel_size - 1) // 2``.
        groups: convolution groups.
        use_act: when true, apply ReLU after the batch norm. The affine
            block is only reached when this is true as well.
        use_lab: when true (and ``use_act`` is true), apply the affine block
            after the ReLU.
        lr_mult: learning-rate multiplier for the conv weight, the batch-norm
            weight and bias, and the affine block.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 groups=1,
                 use_act=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab

        if isinstance(padding, str):
            conv_padding = padding
        else:
            conv_padding = (kernel_size - 1) // 2
        self.conv = Conv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=conv_padding,
            groups=groups,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=False)

        bn_weight_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        bn_bias_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        self.bn = BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        if self.use_act:
            self.act = ReLU()
            if self.use_lab:
                self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        """Apply conv and batch norm, then the enabled optional stages."""
        out = self.bn(self.conv(x))
        if not self.use_act:
            return out
        out = self.act(out)
        if self.use_lab:
            out = self.lab(out)
        return out
class LightConvBNAct(nn.Layer):
    """1x1 ConvBNAct without activation followed by a depthwise ConvBNAct
    with activation.

    Args:
        in_channels (int): input channels of the 1x1 convolution.
        out_channels (int): output channels of both convolutions.
        kernel_size (int): kernel size of the depthwise convolution.
        stride: accepted but not used by this block.
        groups: accepted but not used by this block; the second convolution
            always uses ``groups=out_channels``.
        use_lab (bool): forwarded to both ConvBNAct layers.
        lr_mult (float): forwarded to both ConvBNAct layers.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
        # Pointwise projection, no activation.
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            **shared)
        # Depthwise convolution (one group per channel) with activation.
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            **shared)

    def forward(self, x):
        """Apply the pointwise then the depthwise convolution."""
        return self.conv2(self.conv1(x))
class StemBlock(nn.Layer):
    """Stem of the network: a strided conv, a two-branch section whose
    outputs are concatenated, a second strided conv and a 1x1 projection.

    Args:
        in_channels (int): channels of the input.
        mid_channels (int): channels used inside the stem.
        out_channels (int): channels produced by the final 1x1 convolution.
        use_lab (bool): forwarded to every ConvBNAct.
        lr_mult (float): forwarded to every ConvBNAct.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
        half_channels = mid_channels // 2

        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        # Convolutional branch: two 2x2 convs with "SAME" padding.
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=half_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        self.stem2b = ConvBNAct(
            in_channels=half_channels,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        # Takes the concatenation of the pooled and convolved branches,
        # hence mid_channels * 2 inputs.
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            **shared)
        # Pooling branch.
        self.pool = nn.MaxPool2D(
            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")

    def forward(self, x):
        """Run the stem and return the projected feature map."""
        base = self.stem1(x)
        conv_branch = self.stem2b(self.stem2a(base))
        pool_branch = self.pool(base)
        merged = paddle.concat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
class HG_Block(nn.Layer):
    """Stack of conv layers whose outputs are aggregated with the block input.

    The input and the output of every stacked layer are concatenated along
    axis 1, squeezed to ``out_channels // 2`` by a 1x1 conv and expanded to
    ``out_channels`` by a second 1x1 conv.

    Args:
        in_channels (int): channels of the block input.
        mid_channels (int): output channels of each stacked layer.
        out_channels (int): channels of the block output.
        kernel_size (int): kernel size of the stacked layers.
        layer_num (int): number of stacked layers.
        identity (bool): add the block input to the output.
        light_block (bool): stack LightConvBNAct layers if True, else
            ConvBNAct layers.
        use_lab (bool): forwarded to every conv layer.
        lr_mult (float): forwarded to every conv layer.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 layer_num=6,
                 identity=False,
                 light_block=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.identity = identity

        self.layers = nn.LayerList()
        # Select the layer class directly rather than eval()-ing its name.
        block_cls = LightConvBNAct if light_block else ConvBNAct
        for i in range(layer_num):
            self.layers.append(
                block_cls(
                    # Only the first layer sees the block input width.
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult))

        # feature aggregation
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)

    def forward(self, x):
        """Run the stacked layers, aggregate their outputs and optionally
        add the block input."""
        identity = x
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = paddle.concat(output, axis=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x
class HG_Stage(nn.Layer):
    """Optional strided depthwise downsampling followed by HG_Block layers.

    Args:
        in_channels (int): channels of the stage input.
        mid_channels (int): ``mid_channels`` of every HG_Block.
        out_channels (int): channels of the stage output.
        block_num (int): number of HG_Block layers.
        layer_num (int): ``layer_num`` of every HG_Block.
        downsample (bool): prepend a stride-2 depthwise ConvBNAct.
        light_block (bool): forwarded to every HG_Block.
        kernel_size (int): forwarded to every HG_Block.
        use_lab (bool): forwarded to the sublayers.
        lr_mult (float): forwarded to the sublayers.
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 block_num,
                 layer_num=6,
                 downsample=True,
                 light_block=True,
                 kernel_size=3,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        # Holds the flag first; replaced by the layer itself when enabled,
        # so forward() can test the attribute for truthiness.
        self.downsample = downsample
        if downsample:
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=2,
                groups=in_channels,
                use_act=False,
                use_lab=use_lab,
                lr_mult=lr_mult)

        # The first block maps in_channels -> out_channels without a
        # residual; later blocks keep out_channels and add the identity.
        stage_blocks = [
            HG_Block(
                in_channels=out_channels if idx else in_channels,
                mid_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                layer_num=layer_num,
                identity=idx != 0,
                light_block=light_block,
                use_lab=use_lab,
                lr_mult=lr_mult) for idx in range(block_num)
        ]
        self.blocks = nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Downsample if configured, then run the blocks."""
        if self.downsample:
            x = self.downsample(x)
        return self.blocks(x)
def _freeze_norm(m: nn.BatchNorm2D):
    """Build a frozen BatchNorm2D with the same feature count as ``m``.

    The returned layer is newly constructed: its parameters have learning
    rate 0, no weight decay, ``trainable=False`` and ``stop_gradient=True``,
    and it runs with ``use_global_stats=True``. Only ``m._num_features`` is
    read from ``m``.
    """
    frozen_weight_attr = ParamAttr(
        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
    frozen_bias_attr = ParamAttr(
        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
    frozen = nn.BatchNorm2D(
        m._num_features,
        weight_attr=frozen_weight_attr,
        bias_attr=frozen_bias_attr,
        use_global_stats=True)
    for p in frozen.parameters():
        p.stop_gradient = True
    return frozen
def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
    """Recursively replace every BatchNorm2D under ``model``.

    Args:
        model (nn.Layer): root layer to process.
        reset_func (callable): receives a BatchNorm2D and returns its
            replacement.

    Returns:
        nn.Layer: the replacement when ``model`` itself is a BatchNorm2D,
            otherwise ``model`` with its children updated in place.
    """
    if isinstance(model, nn.BatchNorm2D):
        return reset_func(model)

    for child_name, child in model.named_children():
        replacement = reset_bn(child, reset_func)
        # Re-attach only when the recursion produced a different object.
        if replacement is not child:
            setattr(model, child_name, replacement)
    return model
@register
@serializable
class PPHGNetV2(nn.Layer):
    """
    PPHGNetV2 backbone.

    Args:
        arch (str): key into ``arch_configs`` ('L' or 'X') selecting the stem
            and stage configuration.
        use_lab (bool): whether to use LearnableAffineBlock in the network.
        lr_mult_list (list): learning-rate multipliers; index 0 is used for
            the stem and index ``i + 1`` for stage ``i``.
        return_idx (list): indices of the stages whose outputs are returned.
        freeze_stem_only (bool): when freezing is enabled, freeze only the
            stem instead of the stem plus the leading stages.
        freeze_at (int): freezing is enabled when ``freeze_at >= 0``; if
            ``freeze_stem_only`` is False, stages ``0..freeze_at`` are frozen
            as well.
        freeze_norm (bool): replace every BatchNorm2D with a frozen one.
    """

    arch_configs = {
        'L': {
            'stem_channels': [3, 32, 48],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [48, 48, 128, 1, False, False, 3, 6],
                "stage2": [128, 96, 512, 1, True, False, 3, 6],
                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
            }
        },
        'X': {
            'stem_channels': [3, 32, 64],
            'stage_config': {
                # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num
                "stage1": [64, 64, 128, 1, False, False, 3, 6],
                "stage2": [128, 128, 512, 2, True, False, 3, 6],
                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
            }
        }
    }

    def __init__(self,
                 arch,
                 use_lab=False,
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
                 return_idx=[1, 2, 3],
                 freeze_stem_only=True,
                 freeze_at=0,
                 freeze_norm=True):
        super().__init__()
        self.use_lab = use_lab
        self.return_idx = return_idx

        arch_cfg = self.arch_configs[arch]
        stem_channels = arch_cfg['stem_channels']
        stage_config = arch_cfg['stage_config']

        self._out_strides = [4, 8, 16, 32]
        # Column 2 of every stage row is its out_channels.
        self._out_channels = [row[2] for row in stage_config.values()]

        # stem
        stem_in, stem_mid, stem_out = (stem_channels[0], stem_channels[1],
                                       stem_channels[2])
        self.stem = StemBlock(
            in_channels=stem_in,
            mid_channels=stem_mid,
            out_channels=stem_out,
            use_lab=use_lab,
            lr_mult=lr_mult_list[0])

        # stages
        self.stages = nn.LayerList()
        for stage_idx, row in enumerate(stage_config.values()):
            (in_channels, mid_channels, out_channels, block_num, downsample,
             light_block, kernel_size, layer_num) = row
            self.stages.append(
                HG_Stage(
                    in_channels,
                    mid_channels,
                    out_channels,
                    block_num,
                    layer_num,
                    downsample,
                    light_block,
                    kernel_size,
                    use_lab,
                    lr_mult=lr_mult_list[stage_idx + 1]))

        if freeze_at >= 0:
            self._freeze_parameters(self.stem)
            if not freeze_stem_only:
                last_frozen = min(freeze_at + 1, len(self.stages))
                for stage_idx in range(last_frozen):
                    self._freeze_parameters(self.stages[stage_idx])

        if freeze_norm:
            reset_bn(self, reset_func=_freeze_norm)

        self._init_weights()

    def _freeze_parameters(self, m):
        """Set ``stop_gradient`` on every parameter of ``m``."""
        for param in m.parameters():
            param.stop_gradient = True

    def _init_weights(self):
        """Initialize conv weights (Kaiming normal), batch-norm weights and
        biases (ones / zeros) and linear biases (zeros)."""
        for layer in self.sublayers():
            if isinstance(layer, nn.Conv2D):
                kaiming_normal_(layer.weight)
            elif isinstance(layer, (nn.BatchNorm2D)):
                ones_(layer.weight)
                zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                zeros_(layer.bias)

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) for each stage in ``return_idx``."""
        specs = []
        for idx in self.return_idx:
            specs.append(
                ShapeSpec(
                    channels=self._out_channels[idx],
                    stride=self._out_strides[idx]))
        return specs

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']`` and return the outputs of
        the stages listed in ``return_idx``, in stage order."""
        feat = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            feat = stage(feat)
            if idx in self.return_idx:
                outs.append(feat)
        return outs

View File

@@ -0,0 +1,271 @@
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.nn import AdaptiveAvgPool2D, Conv2D
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
# Names exported by `from <this module> import *`.
__all__ = ['LCNet']
# Per-stage layer table consumed by LCNet.__init__. Each row describes one
# DepthwiseSeparable layer as [k, in_c, out_c, s, use_se]: depthwise kernel
# size, input channels, output channels, stride and whether an SE module is
# inserted. The channel counts are scaled with make_divisible(c * scale)
# when the layers are built.
NET_CONFIG = {
"blocks2":
#k, in_c, out_c, s, use_se
[[3, 16, 32, 1, False], ],
"blocks3": [
[3, 32, 64, 2, False],
[3, 64, 64, 1, False],
],
"blocks4": [
[3, 64, 128, 2, False],
[3, 128, 128, 1, False],
],
"blocks5": [
[3, 128, 256, 2, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
[5, 256, 256, 1, False],
],
"blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]]
}
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    Args:
        v (number): value to round.
        divisor (int): the result is a multiple of this value unless
            ``min_value`` wins.
        min_value (int|None): lower bound of the result; defaults to
            ``divisor``.

    Returns:
        int: the rounded value, bumped up by one ``divisor`` when rounding
            would otherwise lose more than 10% of ``v``.
    """
    floor_value = divisor if min_value is None else min_value
    # Round half up to the nearest multiple of divisor.
    rounded = int(v + divisor / 2) // divisor * divisor
    result = max(floor_value, rounded)
    if result < 0.9 * v:
        result += divisor
    return result
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D -> BatchNorm2D -> activation.

    Args:
        num_channels (int): input channels.
        filter_size (int): kernel size; padding is ``(filter_size - 1) // 2``.
        num_filters (int): output channels.
        stride (int): convolution stride.
        num_groups (int): convolution groups.
        act (str): 'hard_swish' or 'relu6'. Any other value leaves
            ``self.act`` undefined, so ``forward`` would fail.
    """

    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 num_groups=1,
                 act='hard_swish'):
        super().__init__()

        same_padding = (filter_size - 1) // 2
        self.conv = Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=same_padding,
            groups=num_groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # Batch-norm scale and shift are excluded from weight decay.
        self.bn = nn.BatchNorm2D(
            num_filters,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        if act == 'hard_swish':
            self.act = nn.Hardswish()
        elif act == 'relu6':
            self.act = nn.ReLU6()

    def forward(self, x):
        """Apply conv, batch norm and the activation in sequence."""
        return self.act(self.bn(self.conv(x)))
class DepthwiseSeparable(nn.Layer):
    """Depthwise conv, optional SE module, then a 1x1 pointwise conv.

    Args:
        num_channels (int): input channels, also the depthwise width.
        num_filters (int): output channels of the pointwise conv.
        stride (int): stride of the depthwise conv.
        dw_size (int): kernel size of the depthwise conv.
        use_se (bool): insert an SEModule between the two convs.
        act (str): activation name forwarded to both ConvBNLayer instances.
    """

    def __init__(self,
                 num_channels,
                 num_filters,
                 stride,
                 dw_size=3,
                 use_se=False,
                 act='hard_swish'):
        super().__init__()
        self.use_se = use_se

        # One group per channel makes this a depthwise convolution.
        self.dw_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_channels,
            filter_size=dw_size,
            stride=stride,
            num_groups=num_channels,
            act=act)
        if use_se:
            self.se = SEModule(num_channels)
        self.pw_conv = ConvBNLayer(
            num_channels=num_channels,
            filter_size=1,
            num_filters=num_filters,
            stride=1,
            act=act)

    def forward(self, x):
        """Run depthwise conv, optional SE and pointwise conv."""
        feat = self.dw_conv(x)
        if self.use_se:
            feat = self.se(feat)
        return self.pw_conv(feat)
class SEModule(nn.Layer):
    """Squeeze-and-excitation gate: global average pool, two 1x1 convs with
    ReLU and hard-sigmoid, then multiplication with the input.

    Args:
        channel (int): channels of the input and the output.
        reduction (int): the hidden 1x1 conv has ``channel // reduction``
            channels.
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = channel // reduction

        self.avg_pool = AdaptiveAvgPool2D(1)
        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=squeezed,
            kernel_size=1,
            stride=1,
            padding=0)
        self.relu = nn.ReLU()
        self.conv2 = Conv2D(
            in_channels=squeezed,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0)
        self.hardsigmoid = nn.Hardsigmoid()

    def forward(self, x):
        """Return ``x`` multiplied by its computed gate."""
        gate = self.avg_pool(x)
        gate = self.relu(self.conv1(gate))
        gate = self.hardsigmoid(self.conv2(gate))
        return paddle.multiply(x=x, y=gate)
@register
@serializable
class LCNet(nn.Layer):
    """LCNet backbone assembled from the NET_CONFIG stage table.

    Args:
        scale (float): width multiplier; every channel count goes through
            ``make_divisible(c * scale)``.
        feature_maps (list): stages to return. Index 2 refers to blocks3,
            3 to blocks4, 4 to blocks5 and 5 to blocks6.
        act (str): activation name forwarded to every ConvBNLayer.
    """

    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
        super().__init__()
        self.scale = scale
        self.feature_maps = feature_maps

        def build_stage(key):
            # One DepthwiseSeparable per row of NET_CONFIG[key], in order.
            units = []
            for k, in_c, out_c, s, se in NET_CONFIG[key]:
                units.append(
                    DepthwiseSeparable(
                        num_channels=make_divisible(in_c * scale),
                        num_filters=make_divisible(out_c * scale),
                        dw_size=k,
                        stride=s,
                        use_se=se,
                        act=act))
            return nn.Sequential(*units)

        def stage_width(key):
            # Scaled output channels of the last layer of the stage.
            return make_divisible(NET_CONFIG[key][-1][2] * scale)

        self.conv1 = ConvBNLayer(
            num_channels=3,
            filter_size=3,
            num_filters=make_divisible(16 * scale),
            stride=2,
            act=act)
        self.blocks2 = build_stage("blocks2")
        self.blocks3 = build_stage("blocks3")
        self.blocks4 = build_stage("blocks4")
        self.blocks5 = build_stage("blocks5")
        self.blocks6 = build_stage("blocks6")

        # Position p in this list corresponds to feature index p + 2.
        candidate_channels = [
            stage_width(key)
            for key in ("blocks3", "blocks4", "blocks5", "blocks6")
        ]
        self._out_channels = [
            ch for pos, ch in enumerate(candidate_channels)
            if pos + 2 in feature_maps
        ]

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']`` and return the feature
        maps selected by ``self.feature_maps``."""
        x = self.conv1(inputs['image'])
        x = self.blocks2(x)

        stage_outputs = []
        for stage in (self.blocks3, self.blocks4, self.blocks5,
                      self.blocks6):
            x = stage(x)
            stage_outputs.append(x)

        return [
            feat for pos, feat in enumerate(stage_outputs)
            if pos + 2 in self.feature_maps
        ]

    @property
    def out_shape(self):
        """ShapeSpec (channels only) for every returned feature map."""
        specs = []
        for channels in self._out_channels:
            specs.append(ShapeSpec(channels=channels))
        return specs

View File

@@ -0,0 +1,402 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNet']
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D -> BatchNorm2D -> optional ReLU / ReLU6.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        kernel_size (int): convolution kernel size.
        stride (int): convolution stride.
        padding (int): convolution padding.
        num_groups (int): convolution groups.
        act (str): 'relu' or 'relu6'; any other value applies no activation.
        conv_lr (float): learning rate of the conv weight.
        conv_decay (float): L2 decay of the conv weight.
        norm_decay (float): L2 decay of the batch-norm parameters.
        norm_type (str): the batch norm is only created for 'bn' and
            'sync_bn'.
        name (str): accepted but not used by this layer.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 num_groups=1,
                 act='relu',
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.act = act

        conv_weight_attr = ParamAttr(
            learning_rate=conv_lr,
            initializer=KaimingNormal(),
            regularizer=L2Decay(conv_decay))
        self._conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=conv_weight_attr,
            bias_attr=False)

        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        if norm_type in ['sync_bn', 'bn']:
            self._batch_norm = nn.BatchNorm2D(
                out_channels, weight_attr=param_attr, bias_attr=bias_attr)

    def forward(self, x):
        """Apply conv, batch norm and the configured activation."""
        out = self._batch_norm(self._conv(x))
        if self.act == "relu":
            return F.relu(out)
        if self.act == "relu6":
            return F.relu6(out)
        return out
class DepthwiseSeparable(nn.Layer):
    """3x3 grouped (depthwise) conv followed by a 1x1 pointwise conv.

    Args:
        in_channels (int): input channels of the depthwise conv (used as
            given, not scaled here).
        out_channels1 (int): unscaled output channels of the depthwise conv.
        out_channels2 (int): unscaled output channels of the pointwise conv.
        num_groups (int): unscaled group count of the depthwise conv.
        stride (int): stride of the depthwise conv.
        scale (float): width multiplier applied to ``out_channels1``,
            ``out_channels2`` and ``num_groups``.
        conv_lr (float): forwarded to both ConvBNLayer instances.
        conv_decay (float): forwarded to both ConvBNLayer instances.
        norm_decay (float): forwarded to both ConvBNLayer instances.
        norm_type (str): forwarded to both ConvBNLayer instances.
        name (str|None): prefix for the sublayer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups,
                 stride,
                 scale,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(DepthwiseSeparable, self).__init__()
        # The default name=None used to raise TypeError on `name + "_dw"`.
        prefix = "" if name is None else name
        dw_channels = int(out_channels1 * scale)

        self._depthwise_conv = ConvBNLayer(
            in_channels,
            dw_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_dw")

        self._pointwise_conv = ConvBNLayer(
            dw_channels,
            int(out_channels2 * scale),
            kernel_size=1,
            stride=1,
            padding=0,
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_sep")

    def forward(self, x):
        """Apply the depthwise then the pointwise convolution."""
        x = self._depthwise_conv(x)
        x = self._pointwise_conv(x)
        return x
class ExtraBlock(nn.Layer):
    """Extra feature block: 1x1 conv followed by a strided 3x3 conv, both
    with ReLU6.

    Args:
        in_channels (int): input channels of the 1x1 conv.
        out_channels1 (int): output channels of the 1x1 conv.
        out_channels2 (int): output channels of the 3x3 conv.
        num_groups (int): groups of both convolutions.
        stride (int): stride of the 3x3 conv.
        conv_lr (float): forwarded to both ConvBNLayer instances.
        conv_decay (float): forwarded to both ConvBNLayer instances.
        norm_decay (float): forwarded to both ConvBNLayer instances.
        norm_type (str): forwarded to both ConvBNLayer instances.
        name (str|None): prefix for the sublayer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups=1,
                 stride=2,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ExtraBlock, self).__init__()
        # The default name=None used to raise TypeError on `name + "_extra1"`.
        prefix = "" if name is None else name

        self.pointwise_conv = ConvBNLayer(
            in_channels,
            int(out_channels1),
            kernel_size=1,
            stride=1,
            padding=0,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra1")

        self.normal_conv = ConvBNLayer(
            int(out_channels1),
            int(out_channels2),
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups),
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra2")

    def forward(self, x):
        """Apply the 1x1 conv then the 3x3 conv."""
        x = self.pointwise_conv(x)
        x = self.normal_conv(x)
        return x
@register
@serializable
class MobileNet(nn.Layer):
    """MobileNet backbone built from depthwise-separable blocks, with
    optional extra blocks appended after the last one.

    Blocks are numbered from 1 in creation order (conv2_1 is 1, conv6 is
    13); extra blocks continue the numbering.

    Args:
        norm_type (str): forwarded to every conv layer.
        norm_decay (float): L2 decay of the batch-norm parameters.
        conv_decay (float): L2 decay of the conv weights.
        scale (float): width multiplier of the depthwise-separable blocks.
        conv_learning_rate (float): learning rate of the conv weights.
        feature_maps (list|int): block numbers whose outputs are returned.
        with_extra_blocks (bool): append ExtraBlock layers.
        extra_block_filters (list): ``[out_channels1, out_channels2]`` for
            each extra block.
    """
    __shared__ = ['norm_type']

    def __init__(self,
                 norm_type='bn',
                 norm_decay=0.,
                 conv_decay=0.,
                 scale=1,
                 conv_learning_rate=1.0,
                 feature_maps=[4, 6, 13],
                 with_extra_blocks=False,
                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
                                      [64, 128]]):
        super(MobileNet, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        self._out_channels = []

        self.conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=int(32 * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            conv_lr=conv_learning_rate,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name="conv1")

        # (sublayer name, unscaled input channels, unscaled output channels,
        # stride). In every block the depthwise width and the group count
        # equal the unscaled input channels.
        dw_specs = [
            ("conv2_1", 32, 64, 1),
            ("conv2_2", 64, 128, 2),  # 1/4
            ("conv3_1", 128, 128, 1),
            ("conv3_2", 128, 256, 2),  # 1/8
            ("conv4_1", 256, 256, 1),
            ("conv4_2", 256, 512, 2),  # 1/16
        ]
        dw_specs += [("conv5_" + str(i + 1), 512, 512, 1) for i in range(5)]
        dw_specs += [
            ("conv5_6", 512, 1024, 2),  # 1/32
            ("conv6", 1024, 1024, 1),
        ]

        self.dwsl = []
        for layer_name, base_in, base_out, stride in dw_specs:
            block = self.add_sublayer(
                layer_name,
                sublayer=DepthwiseSeparable(
                    in_channels=int(base_in * scale),
                    out_channels1=base_in,
                    out_channels2=base_out,
                    num_groups=base_in,
                    stride=stride,
                    scale=scale,
                    conv_lr=conv_learning_rate,
                    conv_decay=conv_decay,
                    norm_decay=norm_decay,
                    norm_type=norm_type,
                    name=layer_name))
            self.dwsl.append(block)
            self._update_out_channels(
                int(base_out * scale), len(self.dwsl), feature_maps)

        if self.with_extra_blocks:
            self.extra_blocks = []
            for i, block_filter in enumerate(self.extra_block_filters):
                # The first extra block consumes the conv6 output, whose
                # width is scaled; a hard-coded 1024 only matched scale == 1.
                if i == 0:
                    in_c = int(1024 * scale)
                else:
                    in_c = self.extra_block_filters[i - 1][1]
                conv_extra = self.add_sublayer(
                    "conv7_" + str(i + 1),
                    sublayer=ExtraBlock(
                        in_c,
                        block_filter[0],
                        block_filter[1],
                        conv_lr=conv_learning_rate,
                        conv_decay=conv_decay,
                        norm_decay=norm_decay,
                        norm_type=norm_type,
                        name="conv7_" + str(i + 1)))
                self.extra_blocks.append(conv_extra)
                self._update_out_channels(
                    block_filter[1],
                    len(self.dwsl) + len(self.extra_blocks), feature_maps)

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested output."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']`` and return the requested
        feature maps in block order."""
        outs = []
        y = self.conv1(inputs['image'])
        for i, block in enumerate(self.dwsl):
            y = block(y)
            if i + 1 in self.feature_maps:
                outs.append(y)

        if not self.with_extra_blocks:
            return outs

        # The extra blocks continue from the last collected feature map.
        y = outs[-1]
        for i, block in enumerate(self.extra_blocks):
            idx = i + len(self.dwsl)
            y = block(y)
            if idx + 1 in self.feature_maps:
                outs.append(y)
        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels only) for every returned feature map."""
        return [ShapeSpec(channels=c) for c in self._out_channels]

View File

@@ -0,0 +1,478 @@
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
__all__ = ['MobileNetV3']
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    Args:
        v (number): value to round.
        divisor (int): the result is a multiple of this value unless
            ``min_value`` wins.
        min_value (int|None): lower bound of the result; ``None`` means
            ``divisor``.

    Returns:
        int: the rounded value; one extra ``divisor`` is added when the
            rounded value falls below 90% of ``v``.
    """
    lower_bound = min_value
    if lower_bound is None:
        lower_bound = divisor
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    candidate = max(lower_bound, nearest_multiple)
    # Never shrink the value by more than 10%.
    return candidate + divisor if candidate < 0.9 * v else candidate
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D -> BatchNorm2D -> optional activation, with optional
    frozen normalization.

    Args:
        in_c (int): input channels.
        out_c (int): output channels.
        filter_size (int): convolution kernel size.
        stride (int): convolution stride.
        padding (int|str): convolution padding.
        num_groups (int): convolution groups.
        act (str|None): None, 'relu', 'relu6' or 'hard_swish'.
        lr_mult (float): learning rate of the conv weight and, unless the
            norm is frozen, of the norm parameters.
        conv_decay (float): L2 decay of the conv weight.
        norm_type (str): the batch norm is only created for 'bn' and
            'sync_bn'.
        norm_decay (float): L2 decay of the norm parameters.
        freeze_norm (bool): use learning rate 0, non-trainable parameters,
            global statistics and ``stop_gradient`` for the norm.
        name (str): accepted but not used by this layer.

    Raises:
        NotImplementedError: in ``forward`` when ``act`` is not one of the
            supported values.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 filter_size,
                 stride,
                 padding,
                 num_groups=1,
                 act=None,
                 lr_mult=1.,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=""):
        super(ConvBNLayer, self).__init__()
        self.act = act

        conv_weight_attr = ParamAttr(
            learning_rate=lr_mult, regularizer=L2Decay(conv_decay))
        self.conv = nn.Conv2D(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=conv_weight_attr,
            bias_attr=False)

        # A frozen norm gets learning rate 0 and non-trainable parameters.
        if freeze_norm:
            norm_lr, norm_trainable, global_stats = 0., False, True
        else:
            norm_lr, norm_trainable, global_stats = lr_mult, True, None
        param_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=norm_trainable)
        bias_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=norm_trainable)

        if norm_type in ['sync_bn', 'bn']:
            self.bn = nn.BatchNorm2D(
                out_c,
                weight_attr=param_attr,
                bias_attr=bias_attr,
                use_global_stats=global_stats)
        norm_params = self.bn.parameters()
        if freeze_norm:
            for norm_param in norm_params:
                norm_param.stop_gradient = True

    def forward(self, x):
        """Apply conv, batch norm and the configured activation."""
        out = self.bn(self.conv(x))
        if self.act is None:
            return out
        if self.act == "relu":
            return F.relu(out)
        if self.act == "relu6":
            return F.relu6(out)
        if self.act == "hard_swish":
            return F.hardswish(out)
        raise NotImplementedError(
            "The activation function is selected incorrectly.")
class ResidualUnit(nn.Layer):
    """Inverted-residual unit: 1x1 expand conv, depthwise conv, optional SE
    module and a linear 1x1 projection, with a shortcut when shapes allow.

    Args:
        in_c (int): input channels.
        mid_c (int): expanded channels.
        out_c (int): output channels.
        filter_size (int): kernel size of the depthwise conv.
        stride (int): stride of the depthwise conv.
        use_se (bool): insert an SEModule after the depthwise conv.
        lr_mult (float): learning-rate multiplier for the sublayers.
        conv_decay (float): L2 decay of the conv weights.
        norm_type (str): forwarded to every ConvBNLayer.
        norm_decay (float): forwarded to every ConvBNLayer.
        freeze_norm (bool): forwarded to every ConvBNLayer.
        act (str|None): activation of the expand and depthwise convs.
        return_list (bool): make ``forward`` return ``[expanded, output]``
            instead of only the output.
        name (str): prefix for the sublayer names.
    """

    def __init__(self,
                 in_c,
                 mid_c,
                 out_c,
                 filter_size,
                 stride,
                 use_se,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 act=None,
                 return_list=False,
                 name=''):
        super(ResidualUnit, self).__init__()
        # The shortcut needs matching spatial size and channel count.
        self.if_shortcut = stride == 1 and in_c == out_c
        self.use_se = use_se
        self.return_list = return_list

        shared = dict(
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm)

        self.expand_conv = ConvBNLayer(
            in_c=in_c,
            out_c=mid_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=act,
            name=name + "_expand",
            **shared)
        self.bottleneck_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=mid_c,
            filter_size=filter_size,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            num_groups=mid_c,
            act=act,
            name=name + "_depthwise",
            **shared)
        if self.use_se:
            self.mid_se = SEModule(
                mid_c, lr_mult, conv_decay, name=name + "_se")
        # No activation on the projection.
        self.linear_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=out_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=None,
            name=name + "_linear",
            **shared)

    def forward(self, inputs):
        """Return the unit output, or ``[expanded, output]`` when
        ``return_list`` is set."""
        expanded = self.expand_conv(inputs)
        out = self.bottleneck_conv(expanded)
        if self.use_se:
            out = self.mid_se(out)
        out = self.linear_conv(out)
        if self.if_shortcut:
            out = paddle.add(inputs, out)
        if not self.return_list:
            return out
        return [expanded, out]
class SEModule(nn.Layer):
    """Squeeze-and-excitation gate: global average pool, 1x1 conv + ReLU,
    1x1 conv + hard-sigmoid, then multiplication with the input.

    Args:
        channel (int): channels of the input and the output.
        lr_mult (float): learning rate of the conv weights and biases.
        conv_decay (float): L2 decay of the conv weights and biases.
        reduction (int): the hidden conv has ``channel // reduction``
            channels.
        name (str): accepted but not used by this layer.
    """

    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
        super(SEModule, self).__init__()

        def conv_attr():
            # A fresh attribute object for every weight and bias.
            return ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay))

        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        mid_channels = int(channel // reduction)
        self.conv1 = nn.Conv2D(
            in_channels=channel,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=conv_attr(),
            bias_attr=conv_attr())
        self.conv2 = nn.Conv2D(
            in_channels=mid_channels,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=conv_attr(),
            bias_attr=conv_attr())

    def forward(self, inputs):
        """Return ``inputs`` multiplied by the computed gate."""
        gate = self.avg_pool(inputs)
        gate = F.relu(self.conv1(gate))
        gate = F.hardsigmoid(self.conv2(gate), slope=0.2, offset=0.5)
        return paddle.multiply(x=inputs, y=gate)
class ExtraBlockDW(nn.Layer):
    """Extra feature block: 1x1 conv, strided 3x3 grouped conv and a final
    1x1 conv, all with ReLU6.

    Args:
        in_c (int): input channels.
        ch_1 (int): output channels of the first 1x1 conv; also the group
            count of the 3x3 conv.
        ch_2 (int): output channels of the 3x3 conv and the last 1x1 conv.
        stride (int): stride of the 3x3 conv.
        lr_mult (float): learning-rate multiplier for all sublayers.
        conv_decay (float): forwarded to every ConvBNLayer.
        norm_type (str): forwarded to every ConvBNLayer.
        norm_decay (float): forwarded to every ConvBNLayer.
        freeze_norm (bool): forwarded to every ConvBNLayer.
        name (str|None): prefix for the sublayer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_c,
                 ch_1,
                 ch_2,
                 stride,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None):
        super(ExtraBlockDW, self).__init__()
        # The default name=None used to raise TypeError on `name + "_extra1"`.
        prefix = "" if name is None else name

        self.pointwise_conv = ConvBNLayer(
            in_c=in_c,
            out_c=ch_1,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra1")
        self.depthwise_conv = ConvBNLayer(
            in_c=ch_1,
            out_c=ch_2,
            filter_size=3,
            stride=stride,
            padding='SAME',
            num_groups=int(ch_1),
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_dw")
        self.normal_conv = ConvBNLayer(
            in_c=ch_2,
            out_c=ch_2,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_sep")

    def forward(self, inputs):
        """Apply the three convolutions in sequence."""
        x = self.pointwise_conv(inputs)
        x = self.depthwise_conv(x)
        x = self.normal_conv(x)
        return x
@register
@serializable
class MobileNetV3(nn.Layer):
__shared__ = ['norm_type']
def __init__(
self,
scale=1.0,
model_name="large",
feature_maps=[6, 12, 15],
with_extra_blocks=False,
extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
conv_decay=0.0,
multiplier=1.0,
norm_type='bn',
norm_decay=0.0,
freeze_norm=False):
super(MobileNetV3, self).__init__()
if isinstance(feature_maps, Integral):
feature_maps = [feature_maps]
if norm_type == 'sync_bn' and freeze_norm:
raise ValueError(
"The norm_type should not be sync_bn when freeze_norm is True")
self.feature_maps = feature_maps
self.with_extra_blocks = with_extra_blocks
self.extra_block_filters = extra_block_filters
inplanes = 16
if model_name == "large":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, False, "relu", 1],
[3, 64, 24, False, "relu", 2],
[3, 72, 24, False, "relu", 1],
[5, 72, 40, True, "relu", 2], # RCNN output
[5, 120, 40, True, "relu", 1],
[5, 120, 40, True, "relu", 1], # YOLOv3 output
[3, 240, 80, False, "hard_swish", 2], # RCNN output
[3, 200, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 184, 80, False, "hard_swish", 1],
[3, 480, 112, True, "hard_swish", 1],
[3, 672, 112, True, "hard_swish", 1], # YOLOv3 output
[5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 960, 160, True, "hard_swish", 1],
[5, 960, 160, True, "hard_swish", 1], # YOLOv3 output
]
elif model_name == "small":
self.cfg = [
# k, exp, c, se, nl, s,
[3, 16, 16, True, "relu", 2],
[3, 72, 24, False, "relu", 2], # RCNN output
[3, 88, 24, False, "relu", 1], # YOLOv3 output
[5, 96, 40, True, "hard_swish", 2], # RCNN output
[5, 240, 40, True, "hard_swish", 1],
[5, 240, 40, True, "hard_swish", 1],
[5, 120, 48, True, "hard_swish", 1],
[5, 144, 48, True, "hard_swish", 1], # YOLOv3 output
[5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output
[5, 576, 96, True, "hard_swish", 1],
[5, 576, 96, True, "hard_swish", 1], # YOLOv3 output
]
else:
raise NotImplementedError(
"mode[{}_model] is not implemented!".format(model_name))
if multiplier != 1.0:
self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier)
self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier)
self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier)
self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier)
self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier)
self.conv1 = ConvBNLayer(
in_c=3,
out_c=make_divisible(inplanes * scale),
filter_size=3,
stride=2,
padding=1,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult_list[0],
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv1")
self._out_channels = []
self.block_list = []
i = 0
inplanes = make_divisible(inplanes * scale)
for (k, exp, c, se, nl, s) in self.cfg:
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
# for SSD/SSDLite, first head input is after ResidualUnit expand_conv
return_list = self.with_extra_blocks and i + 2 in self.feature_maps
block = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ResidualUnit(
in_c=inplanes,
mid_c=make_divisible(scale * exp),
out_c=make_divisible(scale * c),
filter_size=k,
stride=s,
use_se=se,
act=nl,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
return_list=return_list,
name="conv" + str(i + 2)))
self.block_list.append(block)
inplanes = make_divisible(scale * c)
i += 1
self._update_out_channels(
make_divisible(scale * exp)
if return_list else inplanes, i + 1, feature_maps)
if self.with_extra_blocks:
self.extra_block_list = []
extra_out_c = make_divisible(scale * self.cfg[-1][1])
lr_idx = min(i // 3, len(lr_mult_list) - 1)
lr_mult = lr_mult_list[lr_idx]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ConvBNLayer(
in_c=inplanes,
out_c=extra_out_c,
filter_size=1,
stride=1,
padding=0,
num_groups=1,
act="hard_swish",
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name="conv" + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(extra_out_c, i + 1, feature_maps)
for j, block_filter in enumerate(self.extra_block_filters):
in_c = extra_out_c if j == 0 else self.extra_block_filters[j -
1][1]
conv_extra = self.add_sublayer(
"conv" + str(i + 2),
sublayer=ExtraBlockDW(
in_c,
block_filter[0],
block_filter[1],
stride=2,
lr_mult=lr_mult,
conv_decay=conv_decay,
norm_type=norm_type,
norm_decay=norm_decay,
freeze_norm=freeze_norm,
name='conv' + str(i + 2)))
self.extra_block_list.append(conv_extra)
i += 1
self._update_out_channels(block_filter[1], i + 1, feature_maps)
def _update_out_channels(self, channel, feature_idx, feature_maps):
if feature_idx in feature_maps:
self._out_channels.append(channel)
def forward(self, inputs):
    """Run the backbone and collect the feature maps selected by ``feature_maps``.

    Args:
        inputs (dict): must contain the key ``'image'``, which is fed to
            ``self.conv1``.

    Returns:
        list: outputs of every layer whose 2-based position is listed in
        ``self.feature_maps``, in execution order.
    """
    feat = self.conv1(inputs['image'])
    selected = []
    for position, layer in enumerate(self.block_list, start=2):
        feat = layer(feat)
        if position not in self.feature_maps:
            continue
        if isinstance(feat, list):
            # A block that returns a list hands back two tensors: element 0
            # is collected as the feature map, element 1 flows onward.
            tap, feat = feat[0], feat[1]
            selected.append(tap)
        else:
            selected.append(feat)

    if self.with_extra_blocks:
        # Extra blocks continue the same numbering after the main blocks.
        first_extra = len(self.block_list) + 2
        for position, layer in enumerate(
                self.extra_block_list, start=first_extra):
            feat = layer(feat)
            if position in self.feature_maps:
                selected.append(feat)
    return selected
@property
def out_shape(self):
    """Return one ``ShapeSpec`` per recorded output, carrying its channel count."""
    specs = []
    for num_channels in self._out_channels:
        specs.append(ShapeSpec(channels=num_channels))
    return specs

View File

@@ -0,0 +1,266 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import paddle
import paddle.nn as nn
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Normal, Constant
from ppdet.modeling.ops import get_act_fn
from ppdet.modeling.layers import ConvNormLayer
class MobileOneBlock(nn.Layer):
    """Two-stage over-parameterized block that can be fused for deployment.

    While the multi-branch form is active, ``forward`` computes:

    * stage 1: the sum of ``conv_num`` depthwise ``ConvNormLayer`` branches,
      one depthwise 1x1 ``ConvNormLayer`` (``rbr_1x1``) and, when present, a
      ``BatchNorm2D`` identity branch, followed by the activation;
    * stage 2: the sum of ``conv_num`` 1x1 ``ConvNormLayer`` branches and,
      when present, a ``BatchNorm2D`` identity branch, followed by the
      activation.

    ``convert_to_deploy`` folds each stage into a single ``nn.Conv2D``
    (``conv1`` and ``conv2``) and deletes the branches; ``forward`` then uses
    the fused path.
    """

    def __init__(
            self,
            ch_in,
            ch_out,
            stride,
            kernel_size,
            conv_num=1,
            norm_type='bn',
            norm_decay=0.,
            norm_groups=32,
            bias_on=False,
            lr_scale=1.,
            freeze_norm=False,
            initializer=Normal(
                mean=0., std=0.01),
            skip_quant=False,
            act='relu', ):
        """Build the multi-branch (training-time) form of the block.

        Args:
            ch_in: number of input channels; also the width of stage 1.
            ch_out: number of output channels produced by stage 2.
            stride: stride used by the stage-1 branches.
            kernel_size: kernel size of the stage-1 depthwise branches.
            conv_num: number of parallel branches in each stage.
            norm_type, norm_decay, norm_groups, bias_on, lr_scale,
            freeze_norm, initializer, skip_quant: forwarded unchanged to
                every ``ConvNormLayer`` created here.
            act: activation; ``None``, a str or a dict is resolved through
                ``get_act_fn``, anything else is used as given.
        """
        super(MobileOneBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num
        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            # Stage-1 branch: depthwise conv (groups == ch_in).
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    norm_groups=norm_groups,
                    bias_on=bias_on,
                    lr_scale=lr_scale,
                    freeze_norm=freeze_norm,
                    initializer=initializer,
                    skip_quant=skip_quant))
            # Stage-2 branch: 1x1 conv mapping ch_in -> ch_out.
            self.point_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_out,
                    1,
                    stride=1,
                    groups=1,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    norm_groups=norm_groups,
                    bias_on=bias_on,
                    lr_scale=lr_scale,
                    freeze_norm=freeze_norm,
                    initializer=initializer,
                    skip_quant=skip_quant))
        # Extra stage-1 branch: depthwise 1x1 conv with the block's stride.
        self.rbr_1x1 = ConvNormLayer(
            ch_in,
            ch_in,
            1,
            stride=self.stride,
            groups=ch_in,
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)
        # Both identity (BatchNorm-only) branches exist only when the block
        # keeps channel count and resolution; otherwise they are None.
        self.rbr_identity_st1 = nn.BatchNorm2D(
            num_features=ch_in,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if ch_in == ch_out and self.stride == 1 else None
        self.rbr_identity_st2 = nn.BatchNorm2D(
            num_features=ch_out,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(
                0.0))) if ch_in == ch_out and self.stride == 1 else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act

    def forward(self, x):
        """Apply the block, using the fused convs once they exist."""
        # conv1/conv2 are only created by convert_to_deploy().
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            y = self.act(self.conv2(self.act(self.conv1(x))))
        else:
            # Stage 1: depthwise branches + depthwise 1x1 + optional identity.
            if self.rbr_identity_st1 is None:
                id_out_st1 = 0
            else:
                id_out_st1 = self.rbr_identity_st1(x)
            x1_1 = 0
            for i in range(self.k):
                x1_1 += self.depth_conv[i](x)
            x1_2 = self.rbr_1x1(x)
            x1 = self.act(x1_1 + x1_2 + id_out_st1)
            # Stage 2: 1x1 branches + optional identity.
            if self.rbr_identity_st2 is None:
                id_out_st2 = 0
            else:
                id_out_st2 = self.rbr_identity_st2(x1)
            x2_1 = 0
            for i in range(self.k):
                x2_1 += self.point_conv[i](x1)
            y = self.act(x2_1 + id_out_st2)
        return y

    def convert_to_deploy(self):
        """Fuse every branch into ``conv1``/``conv2`` and delete the branches.

        After this call the multi-branch attributes no longer exist, so the
        equivalent kernels cannot be recomputed from this instance.
        """
        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias(
        )
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)
        # Drop the training-time branches now that their weights are folded in.
        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')

    def get_equivalent_kernel_bias(self):
        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)``.

        Each stage's kernel/bias is the sum of the BN-fused kernel/bias of
        all of that stage's branches; missing identity branches contribute 0.
        """
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1, kernel_size=self.kernel_size)
        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1)
        # The 1x1 kernel is zero-padded up to the stage-1 kernel size before summing.
        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid
        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid
        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid
        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel by ``(self.kernel_size - 1) // 2`` per side.

        Despite the name, the target size follows ``self.kernel_size`` rather
        than being fixed at 3. Returns 0 when ``kernel1x1`` is None.
        """
        if kernel1x1 is None:
            return 0
        else:
            padding_size = (self.kernel_size - 1) // 2
            return nn.functional.pad(
                kernel1x1,
                [padding_size, padding_size, padding_size, padding_size])

    def _fuse_bn_tensor(self, branch, kernel_size=3):
        """Fold a branch's batch norm into an equivalent ``(kernel, bias)``.

        Args:
            branch: ``None``, an ``nn.LayerList`` of conv+norm layers, a single
                ``ConvNormLayer``, or an ``nn.BatchNorm2D`` identity branch.
            kernel_size: only used for the ``nn.BatchNorm2D`` case, where it
                sets the spatial size of the synthesized identity kernel.

        Returns:
            tuple: ``(0, 0)`` for ``None``; the summed fused kernel and bias
            for a ``LayerList``; otherwise the fused kernel and bias of the
            single branch.
        """
        if branch is None:
            return 0, 0
        if isinstance(branch, nn.LayerList):
            fused_kernels = []
            fused_bias = []
            for block in branch:
                kernel = block.conv.weight
                running_mean = block.norm._mean
                running_var = block.norm._variance
                gamma = block.norm.weight
                beta = block.norm.bias
                eps = block.norm._epsilon
                std = (running_var + eps).sqrt()
                # Per-output-channel scale gamma / std, broadcast over the kernel.
                t = (gamma / std).reshape((-1, 1, 1, 1))
                fused_kernels.append(kernel * t)
                fused_bias.append(beta - running_mean * gamma / std)
            return sum(fused_kernels), sum(fused_bias)
        elif isinstance(branch, ConvNormLayer):
            kernel = branch.conv.weight
            running_mean = branch.norm._mean
            running_var = branch.norm._variance
            gamma = branch.norm.weight
            beta = branch.norm.bias
            eps = branch.norm._epsilon
        else:
            assert isinstance(branch, nn.BatchNorm2D)
            # Identity kernel layout: [ch_in, 1, k, k] when kernel_size > 1,
            # [ch_in, ch_in, 1, 1] when kernel_size == 1.
            # NOTE(review): the stage-1 identity is requested with
            # kernel_size=self.kernel_size, so a block built with
            # kernel_size == 1 would get the [ch_in, ch_in, 1, 1] layout for a
            # conv created with groups=ch_in - confirm that case is never used.
            input_dim = self.ch_in if kernel_size == 1 else 1
            kernel_value = paddle.zeros(
                shape=[self.ch_in, input_dim, kernel_size, kernel_size],
                dtype='float32')
            if kernel_size > 1:
                for i in range(self.ch_in):
                    kernel_value[i, i % input_dim, (kernel_size - 1) // 2, (
                        kernel_size - 1) // 2] = 1
            elif kernel_size == 1:
                for i in range(self.ch_in):
                    kernel_value[i, i % input_dim, 0, 0] = 1
            else:
                raise ValueError("Invalid kernel size recieved!")
            kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
            running_mean = branch._mean
            running_var = branch._variance
            gamma = branch.weight
            beta = branch.bias
            eps = branch._epsilon
        std = (running_var + eps).sqrt()
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std

View File

@@ -0,0 +1,69 @@
class NameAdapter(object):
    """Map backbone layer names onto the names used by pretrained weights.

    The wrapped ``model`` may expose ``_model_type`` and ``variant``; both
    fall back to an empty string when absent. Models whose ``_model_type`` is
    ``'SEResNeXt'`` follow a separate naming scheme in every ``fix_*`` helper.
    """

    def __init__(self, model):
        super(NameAdapter, self).__init__()
        self.model = model

    @property
    def model_type(self):
        """``model._model_type`` or ``''`` when the model does not define it."""
        return getattr(self.model, '_model_type', '')

    @property
    def variant(self):
        """``model.variant`` or ``''`` when the model does not define it."""
        return getattr(self.model, 'variant', '')

    def fix_conv_norm_name(self, name):
        """Return the batch-norm name paired with the conv layer ``name``."""
        # Default scheme: "conv1" -> "bn_conv1", otherwise swap the first
        # three characters of ``name`` for "bn".
        default = "bn_" + name if name == "conv1" else "bn" + name[3:]
        return name + "_bn" if self.model_type == 'SEResNeXt' else default

    def fix_shortcut_name(self, name):
        """Return the shortcut layer name; only SEResNeXt rewrites it."""
        if self.model_type != 'SEResNeXt':
            return name
        return 'conv' + name + '_prj'

    def fix_bottleneck_name(self, name):
        """Return ``(conv1, conv2, conv3, shortcut)`` names for a bottleneck."""
        if self.model_type == 'SEResNeXt':
            stem = 'conv' + name
            return stem + '_x1', stem + '_x2', stem + '_x3', name
        return (name + "_branch2a", name + "_branch2b", name + "_branch2c",
                name + "_branch1")

    def fix_basicblock_name(self, name):
        """Return ``(conv1, conv2, shortcut)`` names for a basic block."""
        if self.model_type == 'SEResNeXt':
            stem = 'conv' + name
            return stem + '_x1', stem + '_x2', name
        return name + "_branch2a", name + "_branch2b", name + "_branch1"

    def fix_layer_warp_name(self, stage_num, count, i):
        """Return the name of block ``i`` in stage ``stage_num``.

        Stage 4 with more than 10 blocks is named ``res4a, res4b1, res4b2,
        ...``; every other stage uses letter suffixes ``a, b, c, ...``.
        SEResNeXt uses ``"<stage_num + 2>_<i + 1>"`` instead.
        """
        if count > 10 and stage_num == 4:
            suffix = "a" if i == 0 else "b" + str(i)
        else:
            suffix = chr(ord("a") + i)
        if self.model_type == 'SEResNeXt':
            return str(stage_num + 2) + '_' + str(i + 1)
        return 'res' + str(stage_num) + suffix

    def fix_c1_stage_name(self):
        """Return the stem conv name: ``"res_conv1"`` for ResNeXt, else ``"conv1"``."""
        if self.model_type == 'ResNeXt':
            return "res_conv1"
        return "conv1"

View File

@@ -0,0 +1,611 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from numbers import Integral
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Uniform
from paddle import ParamAttr
from paddle.nn.initializer import Constant
from paddle.vision.ops import DeformConv2D
from .name_adapter import NameAdapter
from ..shape_spec import ShapeSpec
# Names exported by ``from <this module> import *``.
__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']

# Number of residual blocks in each of the four stages, keyed by network
# depth. ResNet indexes this with ``depth`` and passes entry ``i`` as the
# block count of stage ``i``.
ResNet_cfg = {
    18: [2, 2, 2, 2],
    34: [3, 4, 6, 3],
    50: [3, 4, 6, 3],
    101: [3, 4, 23, 3],
    152: [3, 8, 36, 3],
}
class ConvNormLayer(nn.Layer):
    """Bias-free convolution followed by batch norm and an optional activation.

    With ``dcn_v2=True`` the convolution is a ``DeformConv2D`` whose offsets
    and sigmoid-activated mask come from an extra zero-initialized
    ``conv_offset`` convolution. ``act`` is the name of a function in
    ``paddle.nn.functional`` (or a falsy value for no activation).
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size,
                 stride,
                 groups=1,
                 act=None,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 lr=1.0,
                 dcn_v2=False):
        super(ConvNormLayer, self).__init__()
        assert norm_type in ['bn', 'sync_bn']
        self.norm_type = norm_type
        self.act = act
        self.dcn_v2 = dcn_v2

        same_padding = (filter_size - 1) // 2
        if self.dcn_v2:
            kernel_area = filter_size**2
            # Two offset channels and one mask channel per kernel position.
            self.offset_channel = 2 * kernel_area
            self.mask_channel = kernel_area
            self.conv_offset = nn.Conv2D(
                in_channels=ch_in,
                out_channels=3 * kernel_area,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                weight_attr=ParamAttr(initializer=Constant(0.)),
                bias_attr=ParamAttr(initializer=Constant(0.)))
            self.conv = DeformConv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                dilation=1,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)
        else:
            self.conv = nn.Conv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=same_padding,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)

        # A frozen norm gets zero learning rate, non-trainable affine
        # parameters and always normalizes with its global statistics.
        norm_lr = 0. if freeze_norm else lr

        def _affine_attr():
            return ParamAttr(
                learning_rate=norm_lr,
                regularizer=L2Decay(norm_decay),
                trainable=not freeze_norm)

        scale_attr = _affine_attr()
        shift_attr = _affine_attr()
        use_global_stats = True if freeze_norm else None
        if norm_type in ['sync_bn', 'bn']:
            self.norm = nn.BatchNorm2D(
                ch_out,
                weight_attr=scale_attr,
                bias_attr=shift_attr,
                use_global_stats=use_global_stats)
        norm_params = self.norm.parameters()

        if freeze_norm:
            for norm_param in norm_params:
                norm_param.stop_gradient = True

    def forward(self, inputs):
        """Apply convolution, normalization and the optional activation."""
        if self.dcn_v2:
            offset_mask = self.conv_offset(inputs)
            offset, mask = paddle.split(
                offset_mask,
                num_or_sections=[self.offset_channel, self.mask_channel],
                axis=1)
            out = self.conv(inputs, offset, mask=F.sigmoid(mask))
        else:
            out = self.conv(inputs)

        if self.norm_type in ['bn', 'sync_bn']:
            out = self.norm(out)
        if self.act:
            activation = getattr(F, self.act)
            out = activation(out)
        return out
class SELayer(nn.Layer):
    """Squeeze-and-excitation gate: rescales each channel of the input.

    The input is globally average-pooled, passed through two linear layers
    (``ch -> ch // reduction_ratio -> ch``) with ReLU and sigmoid, and the
    resulting per-channel weights multiply the original input.
    """

    def __init__(self, ch, reduction_ratio=16):
        super(SELayer, self).__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)

        def _uniform_weight(fan_in):
            # Uniform init in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
            bound = 1.0 / math.sqrt(fan_in)
            return paddle.ParamAttr(initializer=Uniform(-bound, bound))

        squeeze_attr = _uniform_weight(ch)
        reduced = ch // reduction_ratio
        self.squeeze = nn.Linear(
            ch, reduced, weight_attr=squeeze_attr, bias_attr=True)
        self.extract = nn.Linear(
            reduced, ch, weight_attr=_uniform_weight(reduced), bias_attr=True)

    def forward(self, inputs):
        """Return ``inputs`` scaled channel-wise by the learned gate."""
        gate = paddle.squeeze(self.pool(inputs), axis=[2, 3])
        gate = F.relu(self.squeeze(gate))
        gate = F.sigmoid(self.extract(gate))
        gate = paddle.unsqueeze(gate, axis=[2, 3])
        return gate * inputs
class BasicBlock(nn.Layer):
    """Residual block made of two 3x3 conv-norm layers plus a shortcut.

    When ``shortcut`` is False the skip path is projected by a 1x1
    ``ConvNormLayer``; for variant ``'d'`` with ``stride == 2`` the projection
    is preceded by a 2x2 average pool and itself uses stride 1.
    """

    expansion = 1

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(BasicBlock, self).__init__()
        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'

        # Settings shared by every ConvNormLayer in this block.
        norm_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=ch_out,
                        filter_size=1,
                        stride=1,
                        **norm_cfg))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=ch_out,
                    filter_size=1,
                    stride=stride,
                    **norm_cfg)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=3,
            stride=stride,
            act='relu',
            **norm_cfg)
        # Only the second conv may be deformable.
        self.branch2b = ConvNormLayer(
            ch_in=ch_out,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            act=None,
            dcn_v2=dcn_v2,
            **norm_cfg)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(ch_out)

    def forward(self, inputs):
        """Return ``relu(residual(inputs) + skip(inputs))``."""
        residual = self.branch2b(self.branch2a(inputs))
        if self.std_senet:
            residual = self.se(residual)

        skip = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=skip))
class BottleNeck(nn.Layer):
    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 conv-norm layers plus a shortcut.

    The block outputs ``ch_out * expansion`` channels. Variant ``'a'`` applies
    the stride in the first 1x1 conv, every other variant in the 3x3 conv.
    When ``shortcut`` is False the skip path is projected by a 1x1
    ``ConvNormLayer``; for variant ``'d'`` with ``stride == 2`` the projection
    is preceded by a 2x2 average pool and itself uses stride 1.
    """

    expansion = 4

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=4,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(BottleNeck, self).__init__()
        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)

        # ResNeXt: the inner width scales with base_width and the group count.
        width = int(ch_out * (base_width / 64.)) * groups

        # Settings shared by every ConvNormLayer in this block.
        norm_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=width,
            filter_size=1,
            stride=stride1,
            groups=1,
            act='relu',
            **norm_cfg)
        # Only the 3x3 conv is grouped and may be deformable.
        self.branch2b = ConvNormLayer(
            ch_in=width,
            ch_out=width,
            filter_size=3,
            stride=stride2,
            groups=groups,
            act='relu',
            dcn_v2=dcn_v2,
            **norm_cfg)
        self.branch2c = ConvNormLayer(
            ch_in=width,
            ch_out=ch_out * self.expansion,
            filter_size=1,
            stride=1,
            groups=1,
            **norm_cfg)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=ch_out * self.expansion,
                        filter_size=1,
                        stride=1,
                        **norm_cfg))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=ch_out * self.expansion,
                    filter_size=1,
                    stride=stride,
                    **norm_cfg)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(ch_out * self.expansion)

    def forward(self, inputs):
        """Return ``relu(residual(inputs) + skip(inputs))``."""
        residual = self.branch2c(self.branch2b(self.branch2a(inputs)))
        if self.std_senet:
            residual = self.se(residual)

        skip = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=skip))
class Blocks(nn.Layer):
    """One ResNet stage: ``count`` residual blocks applied in sequence.

    The first block projects its shortcut and, except in stage 2, uses
    stride 2; all later blocks use stride 1 with an identity shortcut and
    take ``ch_out * block.expansion`` input channels.
    """

    def __init__(self,
                 block,
                 ch_in,
                 ch_out,
                 count,
                 name_adapter,
                 stage_num,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super(Blocks, self).__init__()

        self.blocks = []
        in_channels = ch_in
        for i in range(count):
            is_first = i == 0
            layer_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
            unit = block(
                ch_in=in_channels,
                ch_out=ch_out,
                stride=2 if is_first and stage_num != 2 else 1,
                shortcut=not is_first,
                variant=variant,
                groups=groups,
                base_width=base_width,
                lr=lr,
                norm_type=norm_type,
                norm_decay=norm_decay,
                freeze_norm=freeze_norm,
                dcn_v2=dcn_v2,
                std_senet=std_senet)
            self.blocks.append(self.add_sublayer(layer_name, unit))
            if is_first:
                # Every block after the first consumes the expanded width.
                in_channels = ch_out * block.expansion

    def forward(self, inputs):
        """Feed ``inputs`` through every block of the stage in order."""
        feat = inputs
        for unit in self.blocks:
            feat = unit(feat)
        return feat
@register
@serializable
class ResNet(nn.Layer):
    __shared__ = ['norm_type']

    def __init__(self,
                 depth=50,
                 ch_in=64,
                 variant='b',
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
                 groups=1,
                 base_width=64,
                 norm_type='bn',
                 norm_decay=0,
                 freeze_norm=True,
                 freeze_at=0,
                 return_idx=[0, 1, 2, 3],
                 dcn_v2_stages=[-1],
                 num_stages=4,
                 std_senet=False,
                 freeze_stem_only=False):
        """
        Residual Network, see https://arxiv.org/abs/1512.03385

        Args:
            depth (int): ResNet depth, should be 18, 34, 50, 101, 152
                (the keys of ``ResNet_cfg``).
            ch_in (int): output channel of the stem, default 64
            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
                                 lower learning rate ratio is need for pretrained model
                                 got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
                                 Must have exactly 4 entries.
            groups (int): group convolution cardinality
            base_width (int): base width of each group convolution
            norm_type (str): normalization type, 'bn' or 'sync_bn' (the only
                values accepted by ``ConvNormLayer`` in this module)
            norm_decay (float): weight decay for normalization layer weights
            freeze_norm (bool): freeze normalization layers
            freeze_at (int): when >= 0 the stem is frozen and, unless
                ``freeze_stem_only`` is set, so are stages ``0..freeze_at``
                (0-based, capped at ``num_stages``). A negative value freezes
                nothing.
            return_idx (list): index of the stages whose feature maps are returned
            dcn_v2_stages (list): index of stages who select deformable conv v2
            num_stages (int): total num of stages, between 1 and 4
            std_senet (bool): whether use senet, default False.
            freeze_stem_only (bool): with ``freeze_at >= 0``, freeze only the
                stem and leave every residual stage trainable.
        """
        super(ResNet, self).__init__()
        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
        assert num_stages >= 1 and num_stages <= 4
        self.depth = depth
        self.variant = variant
        self.groups = groups
        self.base_width = base_width
        self.norm_type = norm_type
        self.norm_decay = norm_decay
        self.freeze_norm = freeze_norm
        self.freeze_at = freeze_at
        if isinstance(return_idx, Integral):
            return_idx = [return_idx]
        assert max(return_idx) < num_stages, \
            'the maximum return index must smaller than num_stages, ' \
            'but received maximum return index is {} and num_stages ' \
            'is {}'.format(max(return_idx), num_stages)
        self.return_idx = return_idx
        self.num_stages = num_stages
        assert len(lr_mult_list) == 4, \
            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
        # Accept a single stage index as well as a list of them.
        if isinstance(dcn_v2_stages, Integral):
            dcn_v2_stages = [dcn_v2_stages]
        assert max(dcn_v2_stages) < num_stages
        self.dcn_v2_stages = dcn_v2_stages

        block_nums = ResNet_cfg[depth]
        na = NameAdapter(self)

        # Stem: variants 'c' and 'd' use three 3x3 convs, the others a single
        # 7x7 conv. Each entry is [c_in, c_out, kernel, stride, name].
        conv1_name = na.fix_c1_stage_name()
        if variant in ['c', 'd']:
            conv_def = [
                [3, ch_in // 2, 3, 2, "conv1_1"],
                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
            ]
        else:
            conv_def = [[3, ch_in, 7, 2, conv1_name]]
        self.conv1 = nn.Sequential()
        for (c_in, c_out, k, s, _name) in conv_def:
            self.conv1.add_sublayer(
                _name,
                ConvNormLayer(
                    ch_in=c_in,
                    ch_out=c_out,
                    filter_size=k,
                    stride=s,
                    groups=1,
                    act='relu',
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    lr=1.0))

        self.ch_in = ch_in
        ch_out_list = [64, 128, 256, 512]
        # Depths of 50 and above use bottleneck blocks.
        block = BottleNeck if depth >= 50 else BasicBlock

        self._out_channels = [block.expansion * v for v in ch_out_list]
        self._out_strides = [4, 8, 16, 32]

        self.res_layers = []
        for i in range(num_stages):
            lr_mult = lr_mult_list[i]
            stage_num = i + 2
            res_name = "res{}".format(stage_num)
            res_layer = self.add_sublayer(
                res_name,
                Blocks(
                    block,
                    self.ch_in,
                    ch_out_list[i],
                    count=block_nums[i],
                    name_adapter=na,
                    stage_num=stage_num,
                    variant=variant,
                    groups=groups,
                    base_width=base_width,
                    lr=lr_mult,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    dcn_v2=(i in self.dcn_v2_stages),
                    std_senet=std_senet))
            self.res_layers.append(res_layer)
            # The next stage consumes this stage's output width.
            self.ch_in = self._out_channels[i]

        if freeze_at >= 0:
            self._freeze_parameters(self.conv1)
            if not freeze_stem_only:
                for i in range(min(freeze_at + 1, num_stages)):
                    self._freeze_parameters(self.res_layers[i])

    def _freeze_parameters(self, m):
        """Set ``stop_gradient`` on every parameter of layer ``m``."""
        for p in m.parameters():
            p.stop_gradient = True

    @property
    def out_shape(self):
        """One ``ShapeSpec`` (channels and stride) per index in ``return_idx``."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

    def forward(self, inputs):
        """Return the outputs of the stages listed in ``return_idx``.

        Args:
            inputs (dict): must contain the key ``'image'``.
        """
        x = inputs['image']
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
@register
class Res5Head(nn.Layer):
    """Stage-5 residual blocks packaged as a standalone head.

    Depths below 50 use three ``BasicBlock`` (256 -> 512 channels); depths of
    50 and above use three ``BottleNeck`` (1024 -> 2048 channels).
    """

    def __init__(self, depth=50):
        super(Res5Head, self).__init__()
        shallow = depth < 50
        in_channels = 256 if shallow else 1024
        base_channels = 512
        adapter = NameAdapter(self)
        block_cls = BottleNeck if depth >= 50 else BasicBlock
        self.res5 = Blocks(
            block_cls,
            in_channels,
            base_channels,
            count=3,
            name_adapter=adapter,
            stage_num=5)
        self.feat_out = base_channels if shallow else base_channels * 4

    @property
    def out_shape(self):
        """Single ``ShapeSpec`` with ``feat_out`` channels and stride 16."""
        spec = ShapeSpec(
            channels=self.feat_out,
            stride=16, )
        return [spec]

    def forward(self, roi_feat, stage=0):
        """Apply the stage-5 blocks to ``roi_feat``; ``stage`` is not used."""
        return self.res5(roi_feat)

View File

@@ -0,0 +1,250 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
from paddle import ParamAttr
import paddle.nn.functional as F
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
from paddle.nn.initializer import KaimingNormal
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register, serializable
from numbers import Integral
from ..shape_spec import ShapeSpec
from ppdet.modeling.ops import channel_shuffle
__all__ = ['ShuffleNetV2']
class ConvBNLayer(nn.Layer):
    """Bias-free ``Conv2D`` + ``BatchNorm2D`` with an optional activation.

    ``act`` names a function in ``paddle.nn.functional``; the alias
    ``"hard_swish"`` is mapped to ``"hardswish"``. A falsy ``act`` disables
    the activation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 act=None):
        super(ConvBNLayer, self).__init__()
        self._conv = Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # Batch-norm scale and shift are excluded from weight decay.
        self._batch_norm = BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        self.act = 'hardswish' if act == "hard_swish" else act

    def forward(self, inputs):
        """Apply conv, batch norm and, if configured, the activation."""
        out = self._batch_norm(self._conv(inputs))
        if not self.act:
            return out
        return getattr(F, self.act)(out)
class InvertedResidual(nn.Layer):
    """ShuffleNetV2 unit that transforms half of the channels.

    The input is split in two along the channel axis; the first half passes
    through untouched, the second goes through 1x1 -> depthwise 3x3 -> 1x1
    convolutions. The halves are concatenated and channel-shuffled with two
    groups.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super(InvertedResidual, self).__init__()
        branch_width = out_channels // 2
        # Settings shared by the two 1x1 convolutions.
        pointwise = dict(kernel_size=1, stride=1, padding=0, groups=1, act=act)

        self._conv_pw = ConvBNLayer(
            in_channels=in_channels // 2,
            out_channels=branch_width,
            **pointwise)
        self._conv_dw = ConvBNLayer(
            in_channels=branch_width,
            out_channels=branch_width,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=branch_width,
            act=None)
        self._conv_linear = ConvBNLayer(
            in_channels=branch_width,
            out_channels=branch_width,
            **pointwise)

    def forward(self, inputs):
        """Return the shuffled concatenation of the kept and transformed halves."""
        half = inputs.shape[1] // 2
        kept, branch = paddle.split(
            inputs, num_or_sections=[half, half], axis=1)
        branch = self._conv_linear(self._conv_dw(self._conv_pw(branch)))
        merged = paddle.concat([kept, branch], axis=1)
        return channel_shuffle(merged, 2)
class InvertedResidualDS(nn.Layer):
    """ShuffleNetV2 unit whose two branches both consume the full input.

    Branch 1 is depthwise 3x3 -> 1x1; branch 2 is 1x1 -> depthwise 3x3 -> 1x1.
    Each branch emits ``out_channels // 2`` channels and both depthwise convs
    use ``stride``. The outputs are concatenated and channel-shuffled with
    two groups.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super(InvertedResidualDS, self).__init__()
        branch_width = out_channels // 2
        # Settings shared by every 1x1 convolution.
        pointwise = dict(kernel_size=1, stride=1, padding=0, groups=1, act=act)
        # Settings shared by both depthwise 3x3 convolutions.
        depthwise = dict(kernel_size=3, stride=stride, padding=1, act=None)

        # branch1
        self._conv_dw_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            groups=in_channels,
            **depthwise)
        self._conv_linear_1 = ConvBNLayer(
            in_channels=in_channels, out_channels=branch_width, **pointwise)

        # branch2
        self._conv_pw_2 = ConvBNLayer(
            in_channels=in_channels, out_channels=branch_width, **pointwise)
        self._conv_dw_2 = ConvBNLayer(
            in_channels=branch_width,
            out_channels=branch_width,
            groups=branch_width,
            **depthwise)
        self._conv_linear_2 = ConvBNLayer(
            in_channels=branch_width, out_channels=branch_width, **pointwise)

    def forward(self, inputs):
        """Return the shuffled concatenation of both branch outputs."""
        first = self._conv_linear_1(self._conv_dw_1(inputs))
        second = self._conv_pw_2(inputs)
        second = self._conv_linear_2(self._conv_dw_2(second))
        merged = paddle.concat([first, second], axis=1)
        return channel_shuffle(merged, 2)
@register
@serializable
class ShuffleNetV2(nn.Layer):
    """ShuffleNetV2 backbone returning the feature maps listed in ``feature_maps``.

    Args:
        scale (float): width multiplier; one of 0.25, 0.33, 0.5, 1.0, 1.5, 2.0.
        act (str): activation name passed to every conv block.
        feature_maps (list|int): 2-based indices of the blocks whose outputs
            are returned (the first block after the stem has index 2).
    """

    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
        super(ShuffleNetV2, self).__init__()
        self.scale = scale
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        stage_repeats = [4, 8, 4]

        # Channel widths per supported scale; index 1 is the stem width and
        # indices 2..4 are the widths of the three stages.
        channel_table = (
            (0.25, [-1, 24, 24, 48, 96, 512]),
            (0.33, [-1, 24, 32, 64, 128, 512]),
            (0.5, [-1, 24, 48, 96, 192, 1024]),
            (1.0, [-1, 24, 116, 232, 464, 1024]),
            (1.5, [-1, 24, 176, 352, 704, 1024]),
            (2.0, [-1, 24, 244, 488, 976, 2048]),
        )
        for known_scale, channels in channel_table:
            if scale == known_scale:
                stage_out_channels = channels
                break
        else:
            raise NotImplementedError("This scale size:[" + str(scale) +
                                      "] is not implemented!")

        self._out_channels = []
        self._feature_idx = 0
        # 1. conv1
        self._conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=stage_out_channels[1],
            kernel_size=3,
            stride=2,
            padding=1,
            act=act)
        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
        self._feature_idx += 1

        # 2. bottleneck sequences
        self._block_list = []
        for stage_num, num_repeat in enumerate(stage_repeats, start=2):
            stage_width = stage_out_channels[stage_num]
            for i in range(num_repeat):
                layer_name = str(stage_num) + '_' + str(i + 1)
                if i == 0:
                    # First unit of a stage: stride 2, widens from the
                    # previous stage's channel count.
                    unit = InvertedResidualDS(
                        in_channels=stage_out_channels[stage_num - 1],
                        out_channels=stage_width,
                        stride=2,
                        act=act)
                else:
                    unit = InvertedResidual(
                        in_channels=stage_width,
                        out_channels=stage_width,
                        stride=1,
                        act=act)
                self._block_list.append(
                    self.add_sublayer(
                        name=layer_name, sublayer=unit))
                self._feature_idx += 1
                self._update_out_channels(stage_width, self._feature_idx,
                                          self.feature_maps)

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is one of ``feature_maps``."""
        if feature_idx not in feature_maps:
            return
        self._out_channels.append(channel)

    def forward(self, inputs):
        """Return the outputs of the selected blocks.

        Args:
            inputs (dict): must contain the key ``'image'``.
        """
        feat = self._max_pool(self._conv1(inputs['image']))
        selected = []
        for position, unit in enumerate(self._block_list, start=2):
            feat = unit(feat)
            if position in self.feature_maps:
                selected.append(feat)
        return selected

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per selected feature map, carrying its channel count."""
        specs = []
        for num_channels in self._out_channels:
            specs.append(ShapeSpec(channels=num_channels))
        return specs

View File

@@ -0,0 +1,752 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
The copyright of microsoft/Swin-Transformer is as follows:
MIT License [see LICENSE for details]
"""
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import DropPath, Identity
from .transformer_utils import add_parameter, to_2tuple
from .transformer_utils import ones_, zeros_, trunc_normal_
# Names exported by ``from <this module> import *``.
__all__ = ['SwinTransformer']

# Preset hyper-parameters, keyed by preset name. Every entry provides the
# same six keys: pretrain_img_size, embed_dim, depths, num_heads, window_size
# and pretrained (a weights URL).
# NOTE(review): presumably looked up by SwinTransformer from its architecture
# name - confirm against the class, which is outside this section.
MODEL_cfg = {
    # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config
    'swin_T_224': dict(
        pretrain_img_size=224,
        embed_dim=96,
        depths=[2, 2, 6, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
    ),
    'swin_S_224': dict(
        pretrain_img_size=224,
        embed_dim=96,
        depths=[2, 2, 18, 2],
        num_heads=[3, 6, 12, 24],
        window_size=7,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
    ),
    'swin_B_224': dict(
        pretrain_img_size=224,
        embed_dim=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=7,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
    ),
    'swin_L_224': dict(
        pretrain_img_size=224,
        embed_dim=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=7,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
    ),
    'swin_B_384': dict(
        pretrain_img_size=384,
        embed_dim=128,
        depths=[2, 2, 18, 2],
        num_heads=[4, 8, 16, 32],
        window_size=12,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
    ),
    'swin_L_384': dict(
        pretrain_img_size=384,
        embed_dim=192,
        depths=[2, 2, 18, 2],
        num_heads=[6, 12, 24, 48],
        window_size=12,
        pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
    ),
}
class Mlp(nn.Layer):
    """Two-layer perceptron: linear -> activation -> dropout -> linear -> dropout.

    ``hidden_features`` and ``out_features`` fall back to ``in_features`` when
    they are falsy. The same dropout layer is applied after both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_width = out_features or in_features
        hidden_width = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, out_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
def window_partition(x, window_size):
    """Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C); H and W are expected to be multiples of window_size
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    _, height, width, channels = x.shape
    # Split H and W into (number of windows, position inside the window).
    grid = x.reshape([
        -1, height // window_size, window_size, width // window_size,
        window_size, channels
    ])
    # Bring the two window-count axes next to each other, then merge them
    # with the batch axis.
    grid = grid.transpose([0, 1, 3, 2, 4, 5])
    return grid.reshape([-1, window_size, window_size, channels])
def window_reverse(windows, window_size, H, W):
    """Merge windows back into a feature map (inverse of window_partition).

    The batch size is inferred by the reshape (``-1``), so it is not computed
    explicitly here.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    channels = windows.shape[-1]
    # Recover the (rows of windows, columns of windows) grid per image.
    x = windows.reshape([
        -1, H // window_size, W // window_size, window_size, window_size,
        channels
    ])
    # Interleave window rows with in-window rows (same for columns) so the
    # final reshape yields contiguous H and W axes.
    return x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, channels])
class WindowAttention(nn.Layer):
    """Window based multi-head self attention (W-MSA) with relative position bias.

    Works for both shifted and non-shifted windows; the shifted case is
    handled by the additive ``mask`` passed to ``forward``.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        win_h = window_size[0]
        win_w = window_size[1]

        # Learnable bias table with one row per possible relative offset:
        # (2*Wh-1) * (2*Ww-1) rows, nH columns.
        table_rows = (2 * win_h - 1) * (2 * win_w - 1)
        self.relative_position_bias_table = add_parameter(
            self, paddle.zeros((table_rows, num_heads)))

        # Pair-wise relative position index for each token inside the window.
        grid = paddle.stack(
            paddle.meshgrid([paddle.arange(win_h), paddle.arange(win_w)
                             ]))  # 2, Wh, Ww
        flat = paddle.flatten(grid, 1)  # 2, Wh*Ww
        offsets = flat.unsqueeze(axis=2) - flat.unsqueeze(axis=1)
        offsets = offsets.transpose([1, 2, 0])  # Wh*Ww, Wh*Ww, 2
        offsets[:, :, 0] += win_h - 1  # shift to start from 0
        offsets[:, :, 1] += win_w - 1
        # Scale the row offset so that (row, col) pairs map to unique rows of
        # the bias table once summed.
        offsets[:, :, 0] *= 2 * win_w - 1
        self.relative_position_index = offsets.sum(-1)  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        trunc_normal_(self.relative_position_bias_table)
        self.softmax = nn.Softmax(axis=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        _, num_tokens, channels = x.shape
        heads = self.num_heads

        # Project once, then split into per-head query / key / value.
        qkv = self.qkv(x).reshape(
            [-1, num_tokens, 3, heads, channels // heads])
        qkv = qkv.transpose([2, 0, 3, 1, 4])
        q = qkv[0] * self.scale
        k = qkv[1]
        v = qkv[2]
        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))

        # Gather the bias of every token pair from the learnable table.
        bias = paddle.index_select(self.relative_position_bias_table,
                                   self.relative_position_index.flatten())
        window_area = self.window_size[0] * self.window_size[1]
        bias = bias.reshape([window_area, window_area,
                             -1])  # Wh*Ww,Wh*Ww,nH
        bias = bias.transpose([2, 0, 1])  # nH, Wh*Ww, Wh*Ww
        attn = attn + bias.unsqueeze(0)

        if mask is not None:
            # Expose the window axis so the per-window mask broadcasts over
            # batch and heads, then fold it back.
            num_windows = mask.shape[0]
            attn = attn.reshape(
                [-1, num_windows, heads, num_tokens, num_tokens])
            attn = attn + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.reshape([-1, heads, num_tokens, num_tokens])

        attn = self.attn_drop(self.softmax(attn))

        out = paddle.mm(attn, v).transpose([0, 2, 1, 3])
        out = out.reshape([-1, num_tokens, channels])
        return self.proj_drop(self.proj(out))
class SwinTransformerBlock(nn.Layer):
    """ Swin Transformer Block.

    Pre-normalised (shifted-)window attention followed by a pre-normalised
    MLP, each wrapped in a residual connection with optional stochastic
    depth. The spatial size of the token sequence is not an argument of
    ``forward``: it is read from ``self.H`` / ``self.W``, which the caller
    has to assign before calling the block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        # Spatial resolution of the incoming tokens; set by the caller.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C), where H and W are
                taken from ``self.H`` and ``self.W``.
            mask_matrix: Attention mask for cyclic shift; only used when
                ``shift_size`` is greater than 0.
        """
        _, num_tokens, channels = x.shape
        H, W = self.H, self.W
        assert num_tokens == H * W, "input feature has wrong size"

        win = self.window_size
        shift = self.shift_size
        residual = x

        feat = self.norm1(x).reshape([-1, H, W, channels])

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (win - W % win) % win
        pad_b = (win - H % win) % win
        feat = F.pad(feat, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
                     data_format='NHWC')
        _, Hp, Wp, _ = feat.shape

        # cyclic shift
        if shift > 0:
            feat = paddle.roll(feat, shifts=(-shift, -shift), axis=(1, 2))
            attn_mask = mask_matrix
        else:
            attn_mask = None

        # partition windows: nW*B, window_size, window_size, C
        windows = window_partition(feat, win)
        window_count = windows.shape[0]
        # nW*B, window_size*window_size, C
        windows = windows.reshape([window_count, win * win, channels])

        # W-MSA/SW-MSA
        attended = self.attn(windows, mask=attn_mask)

        # merge windows
        attended = attended.reshape([window_count, win, win, channels])
        feat = window_reverse(attended, win, Hp, Wp)  # B H' W' C

        # reverse cyclic shift
        if shift > 0:
            feat = paddle.roll(feat, shifts=(shift, shift), axis=(1, 2))

        # drop the padded rows / columns again
        if pad_r > 0 or pad_b > 0:
            feat = feat[:, :H, :W, :]
        feat = feat.reshape([-1, H * W, channels])

        # residual connections around attention and FFN
        out = residual + self.drop_path(feat)
        return out + self.drop_path(self.mlp(self.norm2(out)))
class PatchMerging(nn.Layer):
    r""" Patch Merging Layer.

    Concatenates the channels of every 2x2 group of neighbouring tokens
    (4*dim), normalises them and projects them to 2*dim, halving the spatial
    resolution.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    # (row offset, column offset) of the four tokens in each 2x2 group, in
    # the order their channels are concatenated.
    _OFFSETS = ((0, 0), (1, 0), (0, 1), (1, 1))

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        """
        _, num_tokens, channels = x.shape
        assert num_tokens == H * W, "input feature has wrong size"

        feat = x.reshape([-1, H, W, channels])

        # An odd height or width gets one extra row / column of padding so
        # that the map splits evenly into 2x2 groups.
        if H % 2 == 1 or W % 2 == 1:
            # paddle F.pad default data_format is 'NCHW'
            feat = F.pad(feat, [0, 0, 0, H % 2, 0, W % 2, 0, 0],
                         data_format='NHWC')
            H += H % 2
            W += W % 2

        # Four strided views, each B H/2 W/2 C.
        parts = [feat[:, dr::2, dc::2, :] for dr, dc in self._OFFSETS]
        merged = paddle.concat(parts, -1)  # B H/2 W/2 4*C
        merged = merged.reshape([-1, H * W // 4,
                                 4 * channels])  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
class BasicLayer(nn.Layer):
    """ A basic Swin Transformer layer for one stage.

    Stacks ``depth`` Swin blocks that alternate between regular windows
    (even index) and windows shifted by ``window_size // 2`` (odd index),
    optionally followed by a downsampling layer.

    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float] | np.ndarray, optional):
            Stochastic depth rate. A sequence is indexed per block, a scalar
            is shared by all blocks. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth

        # Any sequence type carries one rate per block. Previously only
        # np.ndarray was recognised, so a list or tuple was forwarded whole
        # and broke the `drop_path > 0.` comparison inside the block.
        per_block_rates = isinstance(drop_path, (list, tuple, np.ndarray))

        # build blocks
        self.blocks = nn.LayerList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i] if per_block_rates else drop_path,
                norm_layer=norm_layer) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            tuple: ``(x, H, W, x_down, Wh, Ww)`` where ``x`` is the output of
            the last block at resolution (H, W) and ``x_down`` is the
            downsampled feature at resolution (Wh, Ww). Without a downsample
            layer the second triple repeats the first.
        """
        # calculate attention mask for SW-MSA
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        # Label each of the 3 x 3 regions of the padded map with its own id.
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.reshape(
            [-1, self.window_size * self.window_size])
        # Token pairs with different region ids get -100.0 added to their
        # attention logit, pairs from the same region get 0.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        huns = -100.0 * paddle.ones_like(attn_mask)
        attn_mask = huns * (attn_mask != 0).astype("float32")

        for blk in self.blocks:
            # Blocks read the spatial size from attributes, not arguments.
            blk.H, blk.W = H, W
            x = blk(x, attn_mask)

        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    A strided convolution (kernel size == stride == patch size) turns the
    image into a grid of patch embeddings, optionally followed by a
    normalization over the embedding dimension.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size
        self.in_chans = in_chans
        self.embed_dim = embed_dim
        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Embed an image batch; returns a 4-D map with ``embed_dim`` channels."""
        # TODO # export dynamic shape
        _, _, H, W = x.shape
        patch_h = self.patch_size[0]
        patch_w = self.patch_size[1]

        # Pad width, then height, up to the next multiple of the patch size.
        # NOTE(review): the 4-element pad lists presumably follow paddle's
        # [left, right, top, bottom] convention for NCHW input - confirm.
        if W % patch_w != 0:
            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
        if H % patch_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])

        x = self.proj(x)
        if self.norm is None:
            return x

        # Normalise over channels: flatten to (B, Wh*Ww, C), apply the norm,
        # then restore the (B, C, Wh, Ww) layout.
        _, _, Wh, Ww = x.shape
        tokens = self.norm(x.flatten(2).transpose([0, 2, 1]))
        return tokens.transpose([0, 2, 1]).reshape(
            [-1, self.embed_dim, Wh, Ww])
@register
@serializable
class SwinTransformer(nn.Layer):
    """ Swin Transformer backbone

    The architecture-defining arguments ``pretrain_img_size``, ``embed_dim``,
    ``depths``, ``num_heads`` and ``window_size`` are always replaced by the
    values stored in ``MODEL_cfg[arch]``; they only remain in the signature
    for backward compatibility.

    Args:
        arch (str): Architecture of Swin Transformer, a key of ``MODEL_cfg``.
        pretrain_img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        out_indices (tuple(int)): Indices of the stages whose features are returned. Default: (0, 1, 2, 3)
        frozen_stages (int): Freeze the patch embedding and the leading stages
            (see ``_freeze_stages``); -1 freezes nothing. Default: -1
        pretrained (str | None): URL or local path of the weights to load.
            None selects the default URL of ``arch``; any other falsy value
            skips loading. Default: None
    """

    def __init__(self,
                 arch='swin_T_224',
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 pretrained=None):
        super(SwinTransformer, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The selected architecture dictates these hyper-parameters.
        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
        embed_dim = MODEL_cfg[arch]['embed_dim']
        depths = MODEL_cfg[arch]['depths']
        num_heads = MODEL_cfg[arch]['num_heads']
        window_size = MODEL_cfg[arch]['window_size']
        if pretrained is None:
            pretrained = MODEL_cfg[arch]['pretrained']

        self.num_layers = len(depths)
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = add_parameter(
                self,
                paddle.zeros((1, embed_dim, patches_resolution[0],
                              patches_resolution[1])))
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = np.linspace(0, drop_path_rate,
                          sum(depths))  # stochastic depth decay rule

        # build layers
        self.layers = nn.LayerList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging
                if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_sublayer(layer_name, layer)

        self.apply(self._init_weights)
        self._freeze_stages()
        if pretrained:
            # Only a real URL scheme triggers a download; a substring test
            # ('http' in ...) would misroute local paths such as
            # '/data/http_models/swin.pdparams'.
            if pretrained.startswith(('http://', 'https://')):  # URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  # model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Put the leading parts of the network in eval mode and stop their gradients.

        ``frozen_stages >= 0`` freezes the patch embedding, ``>= 1`` also the
        absolute position embedding (when used), and ``>= 2`` additionally
        the first ``frozen_stages - 1`` stages.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initialise Linear layers (trunc-normal weight, zero bias) and LayerNorm (unit weight, zero bias)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Forward function.

        Args:
            x (dict): Input dict; the image batch is read from ``x['image']``.
        Returns:
            list: One feature map per index in ``out_indices``, reshaped to
            (B, C, H, W) with ``C = num_features[i]``.
        """
        x = self.patch_embed(x['image'])
        _, _, Wh, Ww = x.shape
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
        else:
            x = x.flatten(2).transpose([0, 2, 1])
        x = self.pos_drop(x)

        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out is the stage output at (H, W); x feeds the next stage at
            # the (possibly downsampled) resolution (Wh, Ww).
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
                    (0, 3, 1, 2))
                outs.append(out)

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels, stride) of every returned feature map.

        The stride of stage ``i`` is the patch size times ``2**i`` because
        every stage except the last ends with a 2x patch merging. With the
        default ``patch_size=4`` this yields 4, 8, 16, 32. For a non-square
        patch size the height component is used.
        """
        base_stride = self.patch_embed.patch_size[0]
        return [
            ShapeSpec(
                channels=self.num_features[i], stride=base_stride * 2**i)
            for i in self.out_indices
        ]

View File

@@ -0,0 +1,381 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn import ReLU, Swish, GELU
import math
from ppdet.core.workspace import register
from ..shape_spec import ShapeSpec
__all__ = ['TransEncoder']
class BertEmbeddings(nn.Layer):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout.

    Args:
        word_size (int): Number of rows of the word embedding table.
        position_embeddings_size (int): Number of rows of the position
            embedding table.
        word_type_size (int): Number of rows of the token-type embedding table.
        hidden_size (int): Embedding dimension.
        dropout_prob (float): Dropout rate applied to the output.
    """

    def __init__(self, word_size, position_embeddings_size, word_type_size,
                 hidden_size, dropout_prob):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(
            word_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, token_type_ids=None, position_ids=None):
        """Embed the ids in ``x``.

        Args:
            x: Integer ids; the second dimension is treated as the sequence.
            token_type_ids: Optional ids with the shape of ``x``; all zeros
                when omitted.
            position_ids: Optional ids with the shape of ``x``;
                ``0 .. seq_len-1`` for every row when omitted.
        """
        seq_len = paddle.shape(x)[1]
        if position_ids is None:
            position_ids = paddle.arange(
                seq_len, dtype="int64").unsqueeze(0).expand_as(x)
        if token_type_ids is None:
            # paddle.zeros creates float32 by default, which is not a valid
            # index dtype for the embedding lookup below.
            token_type_ids = paddle.zeros(paddle.shape(x), dtype="int64")

        word_embs = self.word_embeddings(x)
        position_embs = self.position_embeddings(position_ids)
        token_type_embs = self.token_type_embeddings(token_type_ids)

        embs_cmb = word_embs + position_embs + token_type_embs
        embs_out = self.layernorm(embs_cmb)
        embs_out = self.dropout(embs_out)
        return embs_out
class BertSelfAttention(nn.Layer):
    """Multi-head scaled dot-product self attention.

    Args:
        hidden_size (int): Input feature dimension; must be divisible by
            ``num_attention_heads``.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate applied to the
            attention probabilities.
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities. Default: False

    Raises:
        ValueError: If ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 output_attentions=False):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            # str.format is required here: the message uses {} placeholders
            # and contains a literal '%', so the '%' operator cannot format it.
            raise ValueError(
                "The hidden_size must be a multiple of the number of attention "
                "heads, but got {} % {} != 0".format(hidden_size,
                                                     num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)
        self.output_attentions = output_attentions

    def forward(self, x, attention_mask, head_mask=None):
        """Attend over the sequence dimension of ``x``.

        Args:
            x: Input features; the last dimension is ``hidden_size``.
            attention_mask: Additive mask, added to the scaled attention
                scores before the softmax.
            head_mask: Optional multiplicative mask applied to the attention
                probabilities after dropout.
        Returns:
            tuple: ``(context,)`` or, with ``output_attentions``,
            ``(context, attention_probs)``.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        query_dim1, query_dim2 = paddle.shape(query)[:-1]
        new_shape = [
            query_dim1, query_dim2, self.num_attention_heads,
            self.attention_head_size
        ]
        # Split the heads; key is laid out already transposed so that
        # query @ key yields the score matrix directly.
        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))

        attention = paddle.matmul(query,
                                  key) / math.sqrt(self.attention_head_size)
        attention = attention + attention_mask
        attention_value = F.softmax(attention, axis=-1)
        attention_value = self.dropout(attention_value)

        if head_mask is not None:
            attention_value = attention_value * head_mask

        # Merge the heads back into a single feature dimension.
        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
                                                                        3))
        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
        new_context_shape = [
            ctx_dim1,
            ctx_dim2,
            self.all_head_size,
        ]
        context = context.reshape(new_context_shape)

        if self.output_attentions:
            return (context, attention_value)
        else:
            return (context, )
class BertAttention(nn.Layer):
    """Self attention followed by a linear projection, dropout and a residual LayerNorm.

    Args:
        hidden_size (int): Feature dimension.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate inside the self
            attention.
        fc_dropout_prob (float): Dropout rate after the output projection.
        output_attentions (bool): Forwarded to the self attention; if True the
            attention probabilities are returned as well. Default: False
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 output_attentions=False):
        super(BertAttention, self).__init__()
        self.bert_selfattention = BertSelfAttention(
            hidden_size, num_attention_heads, attention_probs_dropout_prob,
            output_attentions)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``."""
        attn_out = self.bert_selfattention(x, attention_mask, head_mask)
        projected = self.dropout(self.fc(attn_out[0]))
        # Residual connection around the attention, then normalisation.
        features = self.layernorm(projected + x)
        if len(attn_out) != 2:
            return (features, )
        return (features, attn_out[1])
class BertFeedForward(nn.Layer):
    """Position-wise feed-forward block with a residual LayerNorm.

    Args:
        hidden_size (int): Input and output feature dimension.
        intermediate_size (int): Width of the hidden layer.
        num_attention_heads (int): Unused; kept for a uniform signature.
        attention_probs_dropout_prob (float): Unused; kept for a uniform
            signature.
        fc_dropout_prob (float): Dropout rate after the second linear layer.
        act_fn (str): Name of the activation, resolved in this module's
            namespace. May name a function (e.g. ``'gelu'``) or a layer class
            (e.g. ``'ReLU'``), which is instantiated. Default: 'ReLU'
        output_attentions (bool): Unused; kept for a uniform signature.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertFeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        # NOTE(security): eval() executes arbitrary expressions, so act_fn
        # must only ever come from trusted configuration.
        activation = eval(act_fn)
        # A layer class such as ReLU has to be instantiated first; calling the
        # class on a tensor would construct a layer instead of applying it.
        if isinstance(activation, type) and issubclass(activation, nn.Layer):
            activation = activation()
        self.act_fn = activation
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x):
        """Apply fc1, activation, fc2 and dropout, then LayerNorm(output + x)."""
        features = self.fc1(x)
        features = self.act_fn(features)
        features = self.fc2(features)
        features = self.dropout(features)
        features = self.layernorm(features + x)
        return features
class BertLayer(nn.Layer):
    """One transformer encoder layer: attention block followed by a feed-forward block.

    Args:
        hidden_size (int): Feature dimension.
        intermediate_size (int): Width of the feed-forward hidden layer.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate of the attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after the linear projections.
        act_fn (str): Name of the feed-forward activation. Default: 'ReLU'
        output_attentions (bool): If True, ``forward`` also returns the
            attention probabilities. Default: False
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertLayer, self).__init__()
        # BertAttention takes fc_dropout_prob as its fourth positional
        # argument. Passing output_attentions there (as before) used the flag
        # as a dropout rate and never enabled attention output.
        self.attention = BertAttention(
            hidden_size,
            num_attention_heads,
            attention_probs_dropout_prob,
            fc_dropout_prob,
            output_attentions=output_attentions)
        self.feed_forward = BertFeedForward(
            hidden_size, intermediate_size, num_attention_heads,
            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
            output_attentions)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``."""
        attention_feats = self.attention(x, attention_mask, head_mask)
        features = self.feed_forward(attention_feats[0])
        if len(attention_feats) == 2:
            return (features, attention_feats[1])
        else:
            return (features, )
class BertEncoder(nn.Layer):
    """Stack of ``num_hidden_layers`` BertLayer modules.

    Args:
        num_hidden_layers (int): Number of encoder layers.
        hidden_size (int): Feature dimension.
        intermediate_size (int): Width of the feed-forward hidden layers.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate of the attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after the linear projections.
        act_fn (str): Name of the feed-forward activation. Default: 'ReLU'
        output_attentions (bool): If True, the attention probabilities of
            every layer are returned. Default: False
        output_hidden_feats (bool): If True, the input and the output of
            every layer are returned. Default: False
    """

    def __init__(self,
                 num_hidden_layers,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(BertEncoder, self).__init__()
        self.output_attentions = output_attentions
        self.output_hidden_feats = output_hidden_feats
        self.layers = nn.LayerList([
            BertLayer(hidden_size, intermediate_size, num_attention_heads,
                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
                      output_attentions) for _ in range(num_hidden_layers)
        ])

    def forward(self, x, attention_mask, head_mask=None):
        """Run all layers in sequence.

        Args:
            x: Input features.
            attention_mask: Additive attention mask shared by all layers.
            head_mask: Optional per-layer sequence of head masks.
        Returns:
            tuple: ``(last_features,)`` followed by the tuple of hidden
            features (input plus the output of each layer) if
            ``output_hidden_feats`` and by the tuple of per-layer attentions
            if ``output_attentions``.
        """
        all_features = (x, )
        all_attentions = ()

        for i, layer in enumerate(self.layers):
            mask = head_mask[i] if head_mask is not None else None
            layer_out = layer(x, attention_mask, mask)
            x = layer_out[0]
            # Record the layer OUTPUT. Appending before the update stored the
            # initial input twice and dropped the output of the last layer.
            if self.output_hidden_feats:
                all_features = all_features + (x, )
            if self.output_attentions:
                all_attentions = all_attentions + (layer_out[1], )

        outputs = (x, )
        if self.output_hidden_feats:
            outputs += (all_features, )
        if self.output_attentions:
            outputs += (all_attentions, )
        return outputs
class BertPooler(nn.Layer):
    """Pool a sequence by projecting its first token through Linear + Tanh.

    Args:
        hidden_size (int): Feature dimension of the input and the output.
    """

    def __init__(self, hidden_size):
        super(BertPooler, self).__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.act = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(fc(x[:, 0]))``."""
        return self.act(self.fc(x[:, 0]))
class METROEncoder(nn.Layer):
    """Transformer encoder over image features with a linear residual shortcut.

    The input features are projected to ``hidden_size``, summed with a
    learnable position embedding, encoded by a BertEncoder and projected to
    ``output_feature_dim``. A linear projection of the raw input is added to
    the result.

    Args:
        vocab_size (int): Size of the (unused in ``forward``) word embedding.
        num_hidden_layers (int): Number of encoder layers.
        features_dims (int): Dimension of the input features.
        position_embeddings_size (int): Number of learnable positions.
        hidden_size (int): Encoder feature dimension.
        intermediate_size (int): Width of the feed-forward hidden layers.
        output_feature_dim (int): Dimension of the output features.
        num_attention_heads (int): Number of attention heads.
        attention_probs_dropout_prob (float): Dropout rate of the attention
            probabilities.
        fc_dropout_prob (float): Dropout rate after the linear projections.
        act_fn (str): Name of the feed-forward activation. Default: 'ReLU'
        output_attentions (bool): Request attention probabilities from the
            encoder. Default: False
        output_hidden_feats (bool): Request hidden features from the encoder.
            Default: False
        use_img_layernorm (bool): If True, normalise the embeddings before
            dropout. Default: False
    """

    def __init__(self,
                 vocab_size,
                 num_hidden_layers,
                 features_dims,
                 position_embeddings_size,
                 hidden_size,
                 intermediate_size,
                 output_feature_dim,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False,
                 use_img_layernorm=False):
        super(METROEncoder, self).__init__()
        self.img_dims = features_dims
        self.num_hidden_layers = num_hidden_layers
        self.use_img_layernorm = use_img_layernorm
        self.output_attentions = output_attentions
        # forward() reads this flag; it was never stored before, so enabling
        # output_attentions raised AttributeError.
        self.output_hidden_feats = output_hidden_feats
        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
                                        hidden_size, fc_dropout_prob)
        self.encoder = BertEncoder(
            num_hidden_layers, hidden_size, intermediate_size,
            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
            act_fn, output_attentions, output_hidden_feats)
        self.pooler = BertPooler(hidden_size)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.img_embedding = nn.Linear(
            features_dims, hidden_size, bias_attr=True)
        self.dropout = nn.Dropout(fc_dropout_prob)
        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
        self.residual = nn.Linear(features_dims, output_feature_dim)
        if use_img_layernorm:
            # forward() applies self.layernorm when the flag is set; the layer
            # was never created before. Epsilon matches the other LayerNorm
            # layers of this module.
            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)

        self.apply(self.init_weights)

    def init_weights(self, module):
        """ Initialize the weights.

        Linear and Embedding weights are drawn from N(0, 0.02), LayerNorm is
        reset to weight 1 / bias 0 and Linear biases are zeroed.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.set_value(
                paddle.normal(
                    mean=0.0, std=0.02, shape=module.weight.shape))
        elif isinstance(module, nn.LayerNorm):
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
            module.weight.set_value(
                paddle.full(
                    shape=module.weight.shape, fill_value=1.0))
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))

    def forward(self, x):
        """Encode the features ``x``.

        Args:
            x: Input features; the first two dimensions are used as
                (batch, sequence) and the last one is ``features_dims``.
        Returns:
            The predicted features, or, when both ``output_attentions`` and
            ``output_hidden_feats`` are set, a tuple of the prediction and
            the second and last encoder outputs.
        """
        batchsize, seq_len = paddle.shape(x)[:2]
        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
        position_ids = paddle.arange(
            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)

        # All positions are attended to, so the additive mask is all zeros.
        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
        head_mask = [None] * self.num_hidden_layers

        position_embs = self.position_embeddings(position_ids)
        attention_mask = (1.0 - attention_mask) * -10000.0

        img_features = self.img_embedding(x)

        # We empirically observe that adding an additional learnable position embedding leads to more stable training
        embeddings = position_embs + img_features
        if self.use_img_layernorm:
            embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)

        encoder_outputs = self.encoder(
            embeddings, attention_mask, head_mask=head_mask)

        pred_score = self.cls_head(encoder_outputs[0])
        res_img_feats = self.residual(x)
        pred_score = pred_score + res_img_feats

        if self.output_attentions and self.output_hidden_feats:
            return pred_score, encoder_outputs[1], encoder_outputs[-1]
        else:
            return pred_score
def gelu(x):
    """Exact (erf-based) GELU activation.

    Computes ``x * 0.5 * (1 + erf(x / sqrt(2)))``.
    Reference: https://arxiv.org/abs/1606.08415
    """
    half_x = x * 0.5
    erf_term = 1.0 + paddle.erf(x / math.sqrt(2.0))
    return half_x * erf_term
@register
class TransEncoder(nn.Layer):
    """Chain of METROEncoder stages that progressively change the feature dimension.

    Stage ``i`` maps ``input_feat_dim[i]`` features to ``input_feat_dim[i + 1]``
    features; the last stage outputs 3 features.

    Args:
        vocab_size (int): Forwarded to every METROEncoder. Default: 30522
        num_hidden_layers (int): Encoder layers per stage. Default: 4
        num_attention_heads (int): Attention heads per layer. Default: 4
        position_embeddings_size (int): Number of learnable positions. Default: 512
        intermediate_size (int): Width of the feed-forward hidden layers. Default: 3072
        input_feat_dim (list[int]): Input feature dimension of each stage.
        hidden_feat_dim (list[int]): Hidden size of each stage; every entry
            must be divisible by ``num_attention_heads``.
        attention_probs_dropout_prob (float): Dropout rate of the attention
            probabilities. Default: 0.1
        fc_dropout_prob (float): Dropout rate after the linear projections. Default: 0.1
        act_fn (str): Name of the feed-forward activation. Default: 'gelu'
        output_attentions (bool): Forwarded to every METROEncoder. Default: False
        output_hidden_feats (bool): Forwarded to every METROEncoder. Default: False
    """

    def __init__(self,
                 vocab_size=30522,
                 num_hidden_layers=4,
                 num_attention_heads=4,
                 position_embeddings_size=512,
                 intermediate_size=3072,
                 input_feat_dim=[2048, 512, 128],
                 hidden_feat_dim=[1024, 256, 128],
                 attention_probs_dropout_prob=0.1,
                 fc_dropout_prob=0.1,
                 act_fn='gelu',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(TransEncoder, self).__init__()
        # Each stage outputs what the next stage consumes; the final stage
        # outputs 3 values per token.
        output_feat_dim = input_feat_dim[1:] + [3]

        stages = []
        for idx, (features_dims, output_feature_dim) in enumerate(
                zip(input_feat_dim, output_feat_dim)):
            hidden_size = hidden_feat_dim[idx]
            # init a transformer encoder and append it to a list
            assert hidden_size % num_attention_heads == 0
            stages.append(
                METROEncoder(vocab_size, num_hidden_layers, features_dims,
                             position_embeddings_size, hidden_size,
                             intermediate_size, output_feature_dim,
                             num_attention_heads,
                             attention_probs_dropout_prob, fc_dropout_prob,
                             act_fn, output_attentions, output_hidden_feats))
        self.trans_encoder = paddle.nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through all encoder stages in order."""
        return self.trans_encoder(x)

View File

@@ -0,0 +1,124 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
# Common initializations
# Shared initializer instances, created once at import time and reused:
# a constant fill with 1, a constant fill with 0, and a truncated normal
# initializer configured with std=0.02.
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
# Common Layers
def drop_path(x, drop_prob=0., training=False):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Each sample of the batch is kept with probability ``1 - drop_prob`` and
    the kept samples are scaled by ``1 / (1 - drop_prob)``. The input is
    returned unchanged when ``drop_prob`` is 0 or ``training`` is False.
    The original name is misleading as 'Drop Connect' is a different form of
    dropout in a separate paper.
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ...
    """
    if not training or drop_prob == 0.:
        return x
    keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    # One random value per sample, broadcast over all remaining dimensions.
    mask_shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    # floor(keep_prob + U[0, 1)) is 1 with probability keep_prob, else 0.
    keep_mask = paddle.floor(keep_prob + paddle.rand(mask_shape, dtype=x.dtype))
    return x.divide(keep_prob) * keep_mask
class DropPath(nn.Layer):
    """Layer wrapper around ``drop_path``; active only in training mode.

    Args:
        drop_prob (float, optional): Probability of dropping a sample's path.
            Default: None
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply stochastic depth to ``x`` using the layer's training flag."""
        out = drop_path(x, self.drop_prob, self.training)
        return out
class Identity(nn.Layer):
    """Pass-through layer that returns its input unchanged."""

    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, input):
        """Return ``input`` as is."""
        output = input
        return output
# common funcs
def to_2tuple(x):
    """Return ``x`` unchanged if it is a list or tuple, else the pair ``(x, x)``.

    A list or tuple is passed through without any length check.
    """
    return x if isinstance(x, (list, tuple)) else (x, x)
def add_parameter(layer, datas, name=None):
    """Create a parameter on ``layer`` initialised from ``datas``.

    Args:
        layer: object providing ``create_parameter`` / ``add_parameter``.
        datas: initial values; its ``shape`` becomes the parameter shape.
        name (str, optional): when truthy, the parameter is also registered
            on ``layer`` under this name.

    Returns:
        The newly created parameter.
    """
    created = layer.create_parameter(
        shape=(datas.shape), default_initializer=Assign(datas))
    if not name:
        return created
    layer.add_parameter(name, created)
    return created
def window_partition(x, window_size):
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
        (num_h, num_w): number of windows along height and width
    """
    B, H, W, C = paddle.shape(x)

    # Distance to the next multiple of window_size (0 when already aligned).
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size

    # Padding is applied on the channel-first layout, then the layout is
    # switched back to channel-last.
    pad_spec = paddle.to_tensor(
        [0, int(pad_w), 0, int(pad_h)], dtype='int32')
    channel_first = x.transpose([0, 3, 1, 2])
    x = F.pad(channel_first, pad_spec).transpose([0, 2, 3, 1])

    Hp = H + pad_h
    Wp = W + pad_w
    num_h = Hp // window_size
    num_w = Wp // window_size

    grid = x.reshape([B, num_h, window_size, num_w, window_size, C])
    windows = grid.transpose([0, 1, 3, 2, 4, 5]).reshape(
        [-1, window_size, window_size, C])
    return windows, (Hp, Wp), (num_h, num_w)
def window_unpartition(x, pad_hw, num_hw, hw):
    """
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        pad_hw (Tuple): padded height and width (Hp, Wp).
        num_hw (Tuple): number of windows along height and width (num_h, num_w).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    padded_h, padded_w = pad_hw
    rows, cols = num_hw
    orig_h, orig_w = hw

    total_windows, window_size, _, C = paddle.shape(x)
    # The leading axis holds batch * windows-per-image; recover the batch size.
    B = total_windows // (rows * cols)

    tiles = x.reshape([B, rows, cols, window_size, window_size, C])
    merged = tiles.transpose([0, 1, 3, 2, 4, 5])
    merged = merged.reshape([B, padded_h, padded_w, C])
    # Crop back to the un-padded extent.
    return merged[:, :orig_h, :orig_w, :]

View File

@@ -0,0 +1,652 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
from paddle.nn.initializer import Constant
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import zeros_, DropPath, Identity
class Mlp(nn.Layer):
    """Two-layer feed-forward block: Linear -> activation -> dropout ->
    Linear -> dropout.

    Args:
        in_features (int): input width.
        hidden_features (int, optional): width of the hidden layer; falls
            back to ``in_features`` when falsy.
        out_features (int, optional): output width; falls back to
            ``in_features`` when falsy.
        act_layer: callable returning the activation layer.
        drop (float): dropout probability, shared by both dropout steps.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class Attention(nn.Layer):
    """Multi-head self-attention with optional learned relative position bias.

    When ``qkv_bias`` is set, separate learnable biases are kept for q and v
    and the k bias is fixed to zeros. When ``window_size`` is truthy, a
    relative-position bias table and its lookup index are created; the index
    has one extra row/column in front of the window positions (the inline
    comment in the original names these "cls" entries).
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 window_size=None):
        super().__init__()

        self.num_heads = num_heads
        per_head = dim // num_heads
        self.scale = qk_scale or per_head**-0.5

        # The fused projection itself carries no bias; see q_bias / v_bias.
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)

        if not qkv_bias:
            self.q_bias = None
            self.v_bias = None
        else:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)

        if not window_size:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None
        else:
            self.window_size = window_size
            # (2*Wh-1) * (2*Ww-1) in-window offsets plus 3 extra entries:
            # cls to token & token 2 cls & cls to cls
            self.num_relative_distance = (2 * window_size[0] - 1) * (
                2 * window_size[1] - 1) + 3
            self.relative_position_bias_table = self.create_parameter(
                shape=(self.num_relative_distance, num_heads),
                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH
            self.register_buffer(
                "relative_position_index",
                self._build_relative_position_index(
                    window_size, self.num_relative_distance))

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    @staticmethod
    def _build_relative_position_index(window_size, num_relative_distance):
        """Pair-wise relative position index for each token inside the window."""
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        flat = paddle.flatten(coords, 1)  # 2, Wh*Ww
        as_column = paddle.unsqueeze(flat, 2)
        as_row = paddle.unsqueeze(flat, 1)
        rel = as_column.clone() - as_row.clone()  # 2, Wh*Ww, Wh*Ww
        rel = rel.transpose((1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        rel[:, :, 0] += window_size[0] - 1  # shift to start from 0
        rel[:, :, 1] += window_size[1] - 1
        rel[:, :, 0] *= 2 * window_size[1] - 1

        side = window_size[0] * window_size[1] + 1
        index = paddle.zeros(shape=(side, ) * 2, dtype=rel.dtype)
        index[1:, 1:] = rel.sum(-1)  # Wh*Ww, Wh*Ww
        # First row / first column / corner use the three extra table entries.
        index[0, 0:] = num_relative_distance - 3
        index[0:, 0] = num_relative_distance - 2
        index[0, 0] = num_relative_distance - 1
        return index

    def forward(self, x, rel_pos_bias=None):
        dims = paddle.shape(x)
        N, C = dims[1], dims[2]

        if self.q_bias is None:
            fused_bias = None
        else:
            # k gets a zero bias of the same shape as v_bias.
            fused_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))

        qkv = F.linear(x, weight=self.qkv.weight, bias=fused_bias)
        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads))
        qkv = qkv.transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale

        if self.relative_position_bias_table is not None:
            side = self.window_size[0] * self.window_size[1] + 1
            flat_index = self.relative_position_index.reshape([-1])
            bias = self.relative_position_bias_table[flat_index].reshape(
                [side, side, -1])  # Wh*Ww,Wh*Ww,nH
            bias = bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
            attn = attn + bias.unsqueeze(0)

        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = self.attn_drop(F.softmax(attn, axis=-1))

        out = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C))
        return self.proj_drop(self.proj(out))
class Block(nn.Layer):
    """Pre-norm transformer block: attention and MLP, each on a residual
    path with optional per-channel scaling (``gamma_1`` / ``gamma_2``) and
    stochastic depth.

    Note that ``norm1`` is always ``nn.LayerNorm`` with ``epsilon=1e-6``;
    only ``norm2`` honours ``norm_layer`` and ``epsilon``.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 window_size=None,
                 init_values=None,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-5):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size)

        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        # NOTE(review): `norm_layer` is a string evaluated with eval(); it
        # must only ever come from trusted configuration.
        norm_cls = eval(norm_layer)
        self.norm2 = norm_cls(dim, epsilon=epsilon)

        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))

    def forward(self, x, rel_pos_bias=None):
        # gamma_1 and gamma_2 are created (or left as None) together, so a
        # single flag selects the scaled variant for both branches.
        scaled = self.gamma_1 is not None

        attn_out = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
        if scaled:
            attn_out = self.gamma_1 * attn_out
        x = x + self.drop_path(attn_out)

        mlp_out = self.mlp(self.norm2(x))
        if scaled:
            mlp_out = self.gamma_2 * mlp_out
        x = x + self.drop_path(mlp_out)
        return x
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    A single strided convolution (kernel == stride == ``patch_size``) maps
    the image to a grid of ``embed_dim``-channel patch features.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        along_0 = img_size[0] // patch_size
        along_1 = img_size[1] // patch_size

        # Naming follows the original: index 0 -> "w", index 1 -> "h".
        self.num_patches_w = along_0
        self.num_patches_h = along_1
        self.patch_shape = (along_0, along_1)
        self.num_patches = along_0 * along_1

        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    @property
    def num_patches_in_h(self):
        return self.img_size[1] // self.patch_size

    @property
    def num_patches_in_w(self):
        return self.img_size[0] // self.patch_size

    def forward(self, x, mask=None):
        # The unpacking is kept only because it requires x.shape to have
        # exactly four entries; the values (and `mask`) are not used.
        B, C, H, W = x.shape
        projected = self.proj(x)
        return projected
class RelativePositionBias(nn.Layer):
    """Learned relative position bias shared across blocks.

    Holds a bias table with one row per relative offset inside the window
    (plus three extra rows) and a fixed index buffer that maps every
    (query, key) position pair to a table row.

    Args:
        window_size (Sequence[int]): (Wh, Ww) extent of the token grid.
        num_heads (int): number of attention heads; one bias column each.
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1) + 3
        # Fixed: the keyword is `default_initializer` (was `default_initialize`).
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)
        # cls to token & token 2 cls & cls to cls

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
        # Same unsqueeze-based construction as the per-block Attention layer.
        relative_coords = paddle.unsqueeze(
            coords_flatten, 2) - paddle.unsqueeze(
                coords_flatten, 1)  # 2, Wh*Ww, Wh*Ww
        # Fixed: `.transpose` (was the misspelt `.transpos`).
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        # Fixed: paddle.zeros takes `shape=` (was `size=`).
        relative_position_index = paddle.zeros(
            shape=(window_size[0] * window_size[1] + 1, ) * 2,
            dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(
            -1)  # Wh*Ww, Wh*Ww
        # First row / first column / corner use the three extra table rows.
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        self.register_buffer("relative_position_index",
                             relative_position_index)

    def forward(self):
        """Return the bias gathered from the table, laid out as
        [num_heads, Wh*Ww + 1, Wh*Ww + 1]."""
        side = self.window_size[0] * self.window_size[1] + 1
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape([-1])].reshape(
                [side, side, -1])  # Wh*Ww,Wh*Ww,nH
        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
    """Sinusoid position encoding table.

    Args:
        n_position (int): number of positions (rows).
        d_hid (int): encoding width (columns).
        token (bool): when True, one extra all-zero row is appended after
            the ``n_position`` rows.

    Returns:
        Tensor: float32 tensor of shape [1, n_position (+1 if token), d_hid].
    """

    def get_position_angle_vec(position):
        return [
            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
            for hid_j in range(d_hid)
        ]

    sinusoid_table = np.array(
        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1

    if token:
        # Fixed: numpy's keyword is `axis`; `dim=0` raised TypeError.
        sinusoid_table = np.concatenate(
            [sinusoid_table, np.zeros([1, d_hid])], axis=0)

    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
@register
@serializable
class VisionTransformer(nn.Layer):
    """ Vision Transformer with support for patch input

    Builds a patch embedding, an optional position embedding (learned
    absolute or fixed 2D sin-cos), a stack of ``depth`` transformer blocks
    and, when ``with_fpn`` is set, four resampling heads (``fpn1``..``fpn4``)
    applied to the features collected at ``out_indices``.

    ``forward`` returns a list of NCHW feature maps; ``out_shape`` describes
    their channels and strides.
    """

    def __init__(self,
                 img_size=[672, 1092],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 norm_layer='nn.LayerNorm',
                 init_values=None,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 epsilon=1e-5,
                 final_norm=False,
                 pretrained=None,
                 out_indices=[3, 5, 7, 11],
                 use_abs_pos_emb=False,
                 use_sincos_pos_emb=True,
                 with_fpn=True,
                 num_fpn_levels=4,
                 use_checkpoint=False,
                 **args):
        super().__init__()
        self.img_size = img_size
        self.embed_dim = embed_dim
        self.with_fpn = with_fpn
        self.use_checkpoint = use_checkpoint
        self.use_sincos_pos_emb = use_sincos_pos_emb
        self.use_rel_pos_bias = use_rel_pos_bias
        self.final_norm = final_norm
        self.out_indices = out_indices
        self.num_fpn_levels = num_fpn_levels

        if use_checkpoint:
            paddle.seed(0)

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim)

        self.pos_w = self.patch_embed.num_patches_in_w
        self.pos_h = self.patch_embed.num_patches_in_h

        self.cls_token = self.create_parameter(
            shape=(1, 1, embed_dim),
            default_initializer=paddle.nn.initializer.Constant(value=0.))

        if use_abs_pos_emb:
            # Learned absolute embedding: one row per patch plus one for cls.
            self.pos_embed = self.create_parameter(
                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
                default_initializer=paddle.nn.initializer.TruncatedNormal(
                    std=.02))
        elif use_sincos_pos_emb:
            # Fixed sin-cos embedding, stored as a parameter excluded from
            # gradient computation.
            pos_embed = self.build_2d_sincos_position_embedding(embed_dim)

            self.pos_embed = pos_embed
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
            self.pos_embed.set_value(pos_embed.numpy())
            self.pos_embed.stop_gradient = True
        else:
            self.pos_embed = None

        self.pos_drop = nn.Dropout(p=drop_rate)

        if use_shared_rel_pos_bias:
            self.rel_pos_bias = RelativePositionBias(
                window_size=self.patch_embed.patch_shape, num_heads=num_heads)
        else:
            self.rel_pos_bias = None

        # Stochastic-depth rate grows linearly from 0 to drop_path_rate.
        dpr = np.linspace(0, drop_path_rate, depth)

        self.blocks = nn.LayerList([
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                norm_layer=norm_layer,
                init_values=init_values,
                window_size=self.patch_embed.patch_shape
                if use_rel_pos_bias else None,
                epsilon=epsilon) for i in range(depth)
        ])

        self.pretrained = pretrained
        # Weights are loaded before the FPN heads exist, so only the layers
        # created above can receive pretrained values.
        self.init_weight()

        assert len(out_indices) <= 4, ''
        self.out_channels = [embed_dim for _ in range(num_fpn_levels)]
        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [
            patch_size for _ in range(len(out_indices))
        ]

        self.norm = Identity()

        if self.with_fpn:
            assert num_fpn_levels <= 4, ''
            self.init_fpn(
                embed_dim=embed_dim,
                patch_size=patch_size, )

    def init_weight(self):
        """Load ``self.pretrained`` (URL or local path) into this layer,
        resizing a checkpoint ``pos_embed`` whose shape differs from ours."""
        pretrained = self.pretrained

        if pretrained:
            if 'http' in pretrained:  #URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  #model in local path
                path = pretrained

            load_state_dict = paddle.load(path)
            model_state_dict = self.state_dict()
            pos_embed_name = "pos_embed"

            # Only touch pos_embed when this model actually has one: with
            # both position-embedding options disabled self.pos_embed is
            # None and reading `.shape` from it would raise AttributeError.
            if (self.pos_embed is not None and
                    pos_embed_name in load_state_dict.keys()):
                load_pos_embed = paddle.to_tensor(
                    load_state_dict[pos_embed_name], dtype="float32")
                if self.pos_embed.shape != load_pos_embed.shape:
                    # Checkpoint grid is taken to be square (minus cls row).
                    pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
                    # NOTE(review): target grid is (pos_h, pos_w), i.e.
                    # (img_size[1], img_size[0]) // patch_size — confirm
                    # this axis order matches the token layout in forward.
                    model_state_dict[pos_embed_name] = self.resize_pos_embed(
                        load_pos_embed, (pos_size, pos_size),
                        (self.pos_h, self.pos_w))

                    # self.set_state_dict(model_state_dict)
                    load_state_dict[pos_embed_name] = model_state_dict[
                        pos_embed_name]

                    print("Load pos_embed and resize it from {} to {} .".format(
                        load_pos_embed.shape, self.pos_embed.shape))

            self.set_state_dict(load_state_dict)
            print("Load load_state_dict....")

    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
        """Create ``fpn1``..``fpn4`` for ``patch_size`` 16 or 8 and set
        ``self.norm``. Other patch sizes leave the fpn heads undefined."""
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.BatchNorm2D(embed_dim),
                nn.GELU(),
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn3 = Identity()

            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = Identity()

            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )

            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )

        if not out_with_norm:
            self.norm = Identity()
        else:
            self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)

    def interpolate_pos_encoding(self, x, w, h):
        """Return ``pos_embed`` matched to the token count of ``x``.

        ``w`` and ``h`` are divided by the patch size and compared with
        ``patch_embed.num_patches_w`` / ``num_patches_h``; when they match
        the stored embedding is returned as is, otherwise its patch part is
        bicubically resized to ``(w0, h0)``.
        """
        npatch = x.shape[1] - 1
        N = self.pos_embed.shape[1] - 1
        w0 = w // self.patch_embed.patch_size
        h0 = h // self.patch_embed.patch_size
        if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h:
            return self.pos_embed
        class_pos_embed = self.pos_embed[:, 0]
        patch_pos_embed = self.pos_embed[:, 1:]
        dim = x.shape[-1]
        # An explicit output size is used here rather than a scale factor;
        # see https://github.com/facebookresearch/dino/issues/8 for the
        # floating-point issue with scale factors.
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed.reshape([
                1, self.patch_embed.num_patches_w,
                self.patch_embed.num_patches_h, dim
            ]).transpose((0, 3, 1, 2)),
            (w0, h0),
            mode='bicubic', )

        assert int(w0) == patch_pos_embed.shape[-2] and int(
            h0) == patch_pos_embed.shape[-1]
        patch_pos_embed = patch_pos_embed.transpose(
            (0, 2, 3, 1)).reshape([1, -1, dim])
        return paddle.concat(
            (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1)

    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
        """
        Resize pos_embed weight.
        Args:
            pos_embed (Tensor): the pos_embed weight
            old_hw (list[int]): the height and width of old pos_embed
            new_hw (list[int]): the height and width of new pos_embed
        Returns:
            Tensor: the resized pos_embed weight
        """
        # The first row is kept untouched; only the grid part is resized.
        cls_pos_embed = pos_embed[:, :1, :]
        pos_embed = pos_embed[:, 1:, :]

        pos_embed = pos_embed.transpose([0, 2, 1])
        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
        pos_embed = F.interpolate(
            pos_embed, new_hw, mode='bicubic', align_corners=False)
        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)

        return pos_embed

    def build_2d_sincos_position_embedding(
            self,
            embed_dim=768,
            temperature=10000., ):
        """Build a [1, h*w + 1, embed_dim] sin-cos embedding for the patch
        grid, with an all-zero row in front of the grid rows."""
        h, w = self.patch_embed.patch_shape
        grid_w = paddle.arange(w, dtype=paddle.float32)
        grid_h = paddle.arange(h, dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        # A quarter of the channels each for sin(w), cos(w), sin(h), cos(h).
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        out_w = grid_w.flatten()[..., None] @omega[None]
        out_h = grid_h.flatten()[..., None] @omega[None]

        pos_emb = paddle.concat(
            [
                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
                paddle.cos(out_h)
            ],
            axis=1)[None, :, :]

        pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
        pos_embed = paddle.concat([pe_token, pos_emb], axis=1)
        # pos_embed.stop_gradient = True

        return pos_embed

    def forward(self, x):
        """Run the backbone on an image tensor (or a dict holding it under
        ``'image'``) and return a list of NCHW feature maps."""
        x = x['image'] if isinstance(x, dict) else x
        _, _, h, w = x.shape

        x = self.patch_embed(x)

        B, D, Hp, Wp = x.shape  # b * c * h * w

        cls_tokens = self.cls_token.expand(
            (B, self.cls_token.shape[-2], self.cls_token.shape[-1]))
        x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
        x = paddle.concat([cls_tokens, x], axis=1)

        if self.pos_embed is not None:
            # Called with (h, w) although the parameters are named (w, h).
            x = x + self.interpolate_pos_encoding(x, h, w)

        x = self.pos_drop(x)

        rel_pos_bias = self.rel_pos_bias(
        ) if self.rel_pos_bias is not None else None

        feats = []
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    blk, x, rel_pos_bias, **{"preserve_rng_state": True})
            else:
                x = blk(x, rel_pos_bias)

            if idx in self.out_indices:
                # Drop the cls token and restore the NCHW patch grid.
                xp = paddle.reshape(
                    paddle.transpose(
                        self.norm(x[:, 1:, :]), perm=[0, 2, 1]),
                    shape=[B, D, Hp, Wp])
                feats.append(xp)

        if self.with_fpn:
            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
                -self.num_fpn_levels:]
            assert len(fpns) == len(feats) or len(feats) == 1, ''
            outputs = []
            for i, m in enumerate(fpns):
                # With a single collected feature, every head consumes it.
                outputs.append(
                    m(feats[i] if len(feats) == len(fpns) else feats[-1]))
            return outputs

        return feats

    @property
    def num_layers(self):
        return len(self.blocks)

    @property
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    @property
    def out_shape(self):
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self.out_channels, self.out_strides)
        ]

View File

@@ -0,0 +1,749 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import numpy as np
import math
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import Constant, TruncatedNormal
from ppdet.modeling.shape_spec import ShapeSpec
from ppdet.core.workspace import register, serializable
from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
window_unpartition)
from ..initializer import linear_init_
__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
class Mlp(nn.Layer):
    """Two-layer feed-forward block whose parameters all use the learning
    rate multiplier ``lr_factor``.

    Args:
        in_features (int): input width.
        hidden_features (int, optional): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): output width; falls back to
            ``in_features`` when falsy.
        act_layer (str): expression naming the activation class; it is
            evaluated with ``eval`` and then instantiated.
        drop (float): dropout probability used after both linear layers.
        lr_factor (float): per-parameter learning-rate multiplier.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='nn.GELU',
                 drop=0.,
                 lr_factor=1.0):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features

        self.fc1 = nn.Linear(
            in_features,
            hidden_features,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        # NOTE(review): eval() on a config string — must only receive
        # trusted input.
        self.act = eval(act_layer)()
        self.fc2 = nn.Linear(
            hidden_features,
            out_features,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.drop = nn.Dropout(drop)

        self._init_weights()

    def _init_weights(self):
        for linear in (self.fc1, self.fc2):
            linear_init_(linear)

    def forward(self, x):
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        hidden = self.drop(hidden)
        out = self.fc2(hidden)
        return self.drop(out)
class Attention(nn.Layer):
    """Multi-head self-attention over a [B, H, W, C] token grid with
    optional decomposed relative position embeddings.

    Args:
        dim (int): channel width C of the input.
        num_heads (int): number of attention heads.
        qkv_bias (bool): keep separate learnable q and v biases (k bias is
            zeros); when set, the projection is applied through ``F.linear``
            with these biases.
        attn_bias (bool): give the fused ``qkv`` linear its own bias.
        attn_drop (float): dropout probability on the attention weights.
        proj_drop (float): accepted for signature compatibility; this layer
            does not use it.
        use_rel_pos (bool): add decomposed relative position terms.
        rel_pos_zero_init (bool): keep the relative position parameters at
            zero; otherwise they are re-initialised with a truncated normal.
        window_size (int, optional): side of the attention window; when
            None, ``input_size[0]`` is used.
        input_size (Sequence[int], optional): token grid size; required
            when ``window_size`` is None.
        qk_scale (float, optional): overrides ``head_dim ** -0.5``.
        lr_factor (float): learning-rate multiplier for the parameters.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_bias=False,
                 attn_drop=0.,
                 proj_drop=0.,
                 use_rel_pos=False,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 qk_scale=None,
                 lr_factor=1.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim**-0.5
        self.use_rel_pos = use_rel_pos
        self.input_size = input_size
        self.rel_pos_zero_init = rel_pos_zero_init
        self.window_size = window_size
        self.lr_factor = lr_factor

        self.qkv = nn.Linear(
            dim,
            dim * 3,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor)
            if attn_bias else False)
        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        self.proj = nn.Linear(
            dim,
            dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.attn_drop = nn.Dropout(attn_drop)
        if window_size is None:
            # Global attention: relative positions span the full grid side.
            self.window_size = self.input_size[0]

        self._init_weights()

    def _init_weights(self):
        linear_init_(self.qkv)
        linear_init_(self.proj)

        if self.use_rel_pos:
            self.rel_pos_h = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))
            self.rel_pos_w = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))

            if not self.rel_pos_zero_init:
                # Fixed: an initializer must be constructed and then called
                # on the parameter; `TruncatedNormal(param, std=...)` only
                # built an unused initializer and left the values at zero.
                trunc_normal = TruncatedNormal(std=0.02)
                trunc_normal(self.rel_pos_h)
                trunc_normal(self.rel_pos_w)

    def get_rel_pos(self, seq_size, rel_pos):
        """Gather relative position embeddings for a sequence of length
        ``seq_size``, linearly resizing ``rel_pos`` when its length is not
        ``2 * seq_size - 1``.

        Returns:
            Tensor of shape [seq_size, seq_size, head_dim].
        """
        max_rel_dist = int(2 * seq_size - 1)
        # Interpolate rel pos if needed.
        if rel_pos.shape[0] != max_rel_dist:
            # Interpolate rel pos.
            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
            rel_pos = rel_pos.transpose([0, 2, 1])
            rel_pos_resized = F.interpolate(
                rel_pos,
                size=(max_rel_dist, ),
                mode="linear",
                data_format='NCW')
            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
            rel_pos_resized = rel_pos_resized.transpose([1, 0])
        else:
            rel_pos_resized = rel_pos

        coords = paddle.arange(seq_size, dtype='float32')
        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
        # Shift offsets from [-(n-1), n-1] to [0, 2n-2] for use as indices.
        relative_coords += (seq_size - 1)
        relative_coords = relative_coords.astype('int64').flatten()

        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
            [seq_size, seq_size, self.head_dim])

    def add_decomposed_rel_pos(self, attn, q, h, w):
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        Args:
            attn (Tensor): attention map.
            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
            h (int): height of the token grid.
            w (int): width of the token grid.
        Returns:
            attn (Tensor): attention map with added relative positional embeddings.
        """
        Rh = self.get_rel_pos(h, self.rel_pos_h)
        Rw = self.get_rel_pos(w, self.rel_pos_w)

        B, _, dim = q.shape
        r_q = q.reshape([B, h, w, dim])
        # bhwc, hch->bhwh1
        # bwhc, wcw->bhw1w
        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)

        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
        return attn.reshape([B, h * w, h * w])

    def forward(self, x):
        B, H, W, C = paddle.shape(x)

        if self.q_bias is not None:
            # k gets a zero bias of the same shape as v_bias.
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)

        # Fixed: the split into per-head q/k/v must happen on both paths;
        # previously the q_bias path skipped it, so qkv[0..2] below indexed
        # the batch axis instead of q, k and v.
        qkv = qkv.reshape(
            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
                [2, 0, 3, 1, 4]).reshape(
                    [3, B * self.num_heads, H * W, self.head_dim])

        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale

        if self.use_rel_pos:
            attn = self.add_decomposed_rel_pos(attn, q, H, W)

        attn = F.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)
        x = attn.matmul(v).reshape(
            [B, self.num_heads, H * W, self.head_dim]).transpose(
                [0, 2, 1, 3]).reshape([B, H, W, C])
        x = self.proj(x)
        return x
class Block(nn.Layer):
    """Pre-norm transformer block on a [B, H, W, C] token grid.

    When ``window_size`` is not None the normalised input is split into
    windows before attention and merged back afterwards; otherwise
    attention sees the whole grid. Both residual branches support optional
    per-channel scaling (``gamma_1`` / ``gamma_2``) and stochastic depth.
    The norm layers are created with zero L2 regularisation.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 attn_bias=False,
                 qk_scale=None,
                 init_values=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 use_rel_pos=True,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 act_layer='nn.GELU',
                 norm_layer='nn.LayerNorm',
                 lr_factor=1.0,
                 epsilon=1e-5):
        super().__init__()
        self.window_size = window_size

        # NOTE(review): eval() on a config string — must only receive
        # trusted input.
        self.norm1 = eval(norm_layer)(
            dim,
            weight_attr=ParamAttr(
                learning_rate=lr_factor, regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(
                learning_rate=lr_factor, regularizer=L2Decay(0.0)),
            epsilon=epsilon)

        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_bias=attn_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            window_size=window_size,
            input_size=input_size,
            lr_factor=lr_factor)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        self.norm2 = eval(norm_layer)(
            dim,
            weight_attr=ParamAttr(
                learning_rate=lr_factor, regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(
                learning_rate=lr_factor, regularizer=L2Decay(0.0)),
            epsilon=epsilon)

        hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=hidden_dim,
                       act_layer=act_layer,
                       drop=drop,
                       lr_factor=lr_factor)

        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))

    def forward(self, x):
        windowed = self.window_size is not None

        # --- attention branch ---
        branch = self.norm1(x)
        if windowed:
            branch, pad_hw, num_hw = window_partition(branch,
                                                      self.window_size)
        branch = self.attn(branch)
        if self.gamma_1 is not None:
            branch = self.gamma_1 * branch
        if windowed:
            # Merge windows and crop back to the input's spatial size.
            original_hw = (x.shape[1], x.shape[2])
            branch = window_unpartition(branch, pad_hw, num_hw, original_hw)
        x = x + self.drop_path(branch)

        # --- MLP branch ---
        ffn = self.mlp(self.norm2(x))
        if self.gamma_2 is not None:
            ffn = self.gamma_2 * ffn
        x = x + self.drop_path(ffn)

        return x
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    One strided convolution (kernel == stride == ``patch_size``) whose
    weight and bias use the learning-rate multiplier ``lr_factor``.
    """

    def __init__(self,
                 img_size=(224, 224),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 lr_factor=0.01):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size

        conv_weight_attr = ParamAttr(learning_rate=lr_factor)
        conv_bias_attr = ParamAttr(learning_rate=lr_factor)
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            weight_attr=conv_weight_attr,
            bias_attr=conv_bias_attr)

    @property
    def num_patches_in_h(self):
        # Naming follows the original: index 1 -> "h".
        return self.img_size[1] // self.patch_size

    @property
    def num_patches_in_w(self):
        # Naming follows the original: index 0 -> "w".
        return self.img_size[0] // self.patch_size

    def forward(self, x):
        return self.proj(x)
@register
@serializable
class VisionTransformer2D(nn.Layer):
""" Vision Transformer with support for patch input
"""
def __init__(self,
img_size=(1024, 1024),
patch_size=16,
in_chans=3,
embed_dim=768,
depth=12,
num_heads=12,
mlp_ratio=4,
qkv_bias=False,
attn_bias=False,
qk_scale=None,
init_values=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_layer='nn.GELU',
norm_layer='nn.LayerNorm',
lr_decay_rate=1.0,
global_attn_indexes=(2, 5, 8, 11),
use_abs_pos=False,
use_rel_pos=False,
use_abs_pos_emb=False,
use_sincos_pos_emb=False,
rel_pos_zero_init=True,
epsilon=1e-5,
final_norm=False,
pretrained=None,
window_size=None,
out_indices=(11, ),
with_fpn=False,
use_checkpoint=False,
*args,
**kwargs):
super().__init__()
self.img_size = img_size
self.patch_size = patch_size
self.embed_dim = embed_dim
self.num_heads = num_heads
self.depth = depth
self.global_attn_indexes = global_attn_indexes
self.epsilon = epsilon
self.with_fpn = with_fpn
self.use_checkpoint = use_checkpoint
self.patch_h = img_size[0] // patch_size
self.patch_w = img_size[1] // patch_size
self.num_patches = self.patch_h * self.patch_w
self.use_abs_pos = use_abs_pos
self.use_abs_pos_emb = use_abs_pos_emb
self.patch_embed = PatchEmbed(
img_size=img_size,
patch_size=patch_size,
in_chans=in_chans,
embed_dim=embed_dim)
dpr = np.linspace(0, drop_path_rate, depth)
if use_checkpoint:
paddle.seed(0)
if use_abs_pos_emb:
self.pos_w = self.patch_embed.num_patches_in_w
self.pos_h = self.patch_embed.num_patches_in_h
self.pos_embed = self.create_parameter(
shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
default_initializer=paddle.nn.initializer.TruncatedNormal(
std=.02))
elif use_sincos_pos_emb:
pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
self.patch_w)
self.pos_embed = pos_embed
self.pos_embed = self.create_parameter(shape=pos_embed.shape)
self.pos_embed.set_value(pos_embed.numpy())
self.pos_embed.stop_gradient = True
else:
self.pos_embed = None
self.blocks = nn.LayerList([
Block(
embed_dim,
num_heads=num_heads,
mlp_ratio=mlp_ratio,
qkv_bias=qkv_bias,
attn_bias=attn_bias,
qk_scale=qk_scale,
drop=drop_rate,
attn_drop=attn_drop_rate,
drop_path=dpr[i],
use_rel_pos=use_rel_pos,
rel_pos_zero_init=rel_pos_zero_init,
window_size=None
if i in self.global_attn_indexes else window_size,
input_size=[self.patch_h, self.patch_w],
act_layer=act_layer,
lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
norm_layer=norm_layer,
init_values=init_values,
epsilon=epsilon) for i in range(depth)
])
assert len(out_indices) <= 4, 'out_indices out of bound'
self.out_indices = out_indices
self.pretrained = pretrained
self.init_weight()
self.out_channels = [embed_dim for _ in range(len(out_indices))]
self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
patch_size for _ in range(len(out_indices))
]
self.norm = Identity()
if self.with_fpn:
self.init_fpn(
embed_dim=embed_dim,
patch_size=patch_size,
out_with_norm=final_norm)
def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
return lr_decay_rate**(self.depth - layer_id)
def init_weight(self):
# Load the checkpoint named by ``self.pretrained`` into this layer.
# A falsy ``self.pretrained`` makes this method a no-op.
pretrained = self.pretrained
if pretrained:
# A value containing 'http' is treated as a URL and resolved to a local
# path through paddle's download helper; anything else is used as a path.
if 'http' in pretrained:
path = paddle.utils.download.get_weights_path_from_url(
pretrained)
else:
path = pretrained
load_state_dict = paddle.load(path)
model_state_dict = self.state_dict()
pos_embed_name = "pos_embed"
# Only reconcile the positional embedding when the checkpoint carries one
# and this model uses the learned absolute embedding.
if pos_embed_name in load_state_dict.keys(
) and self.use_abs_pos_emb:
load_pos_embed = paddle.to_tensor(
load_state_dict[pos_embed_name], dtype="float32")
if self.pos_embed.shape != load_pos_embed.shape:
# The checkpoint grid side is computed as sqrt(tokens - 1): one leading
# token is excluded and the remaining tokens are taken as a square grid.
pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
model_state_dict[pos_embed_name] = self.resize_pos_embed(
load_pos_embed, (pos_size, pos_size),
(self.pos_h, self.pos_w))
# self.set_state_dict(model_state_dict)
# The resized embedding replaces the checkpoint entry so that the single
# set_state_dict call below loads it together with the other weights.
load_state_dict[pos_embed_name] = model_state_dict[
pos_embed_name]
print("Load pos_embed and resize it from {} to {} .".format(
load_pos_embed.shape, self.pos_embed.shape))
self.set_state_dict(load_state_dict)
print("Load load_state_dict....")
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
    """Create the four per-level resampling heads and the output norm.

    Args:
        embed_dim (int): channel count kept by every head.
        patch_size (int): patch size of the backbone; 16 and 8 are handled.
        out_with_norm (bool): use ``nn.LayerNorm`` as ``self.norm`` when
            True, otherwise ``Identity``.

    Note:
        For any other ``patch_size`` no ``fpn1``..``fpn4`` attribute is
        assigned by this method.
    """

    def upsample_2x():
        # Stride-2 transposed convolution that keeps the channel count.
        return nn.Conv2DTranspose(
            embed_dim, embed_dim, kernel_size=2, stride=2)

    if patch_size == 16:
        self.fpn1 = nn.Sequential(
            upsample_2x(),
            nn.BatchNorm2D(embed_dim),
            nn.GELU(),
            upsample_2x())
        self.fpn2 = nn.Sequential(upsample_2x())
        self.fpn3 = Identity()
        self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
    elif patch_size == 8:
        self.fpn1 = nn.Sequential(upsample_2x())
        self.fpn2 = Identity()
        self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2))
        self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4))

    if out_with_norm:
        self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)
    else:
        self.norm = Identity()
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
    """
    Resize pos_embed weight to a new spatial grid.

    The first token along axis 1 is kept untouched; the remaining tokens are
    laid out as an ``old_hw`` grid, bicubically interpolated to ``new_hw``
    and flattened back before being concatenated after the first token.

    Args:
        pos_embed (Tensor): the pos_embed weight
        old_hw (list[int]): the height and width of old pos_embed
        new_hw (list[int]): the height and width of new pos_embed
    Returns:
        Tensor: the resized pos_embed weight
    """
    leading_token = pos_embed[:, :1, :]
    grid = pos_embed[:, 1:, :].transpose([0, 2, 1])
    grid = grid.reshape([1, -1, old_hw[0], old_hw[1]])
    grid = F.interpolate(grid, new_hw, mode='bicubic', align_corners=False)
    grid_tokens = grid.flatten(2).transpose([0, 2, 1])
    return paddle.concat([leading_token, grid_tokens], axis=1)
def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
    """Build a fixed 2D sine/cosine position embedding.

    Args:
        h: number of grid rows.
        w: number of grid columns.
        temperature (float): base of the frequency schedule.

    Returns:
        Tensor: embedding reshaped to ``[1, h, w, self.embed_dim]``; the
        channel axis is the concatenation sin(y), cos(y), sin(x), cos(x).
    """
    row_ids = paddle.arange(h, dtype=paddle.float32)
    col_ids = paddle.arange(w, dtype=paddle.float32)
    grid_y, grid_x = paddle.meshgrid(row_ids, col_ids)
    assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
    # A quarter of the channels goes to each of the four sin/cos groups.
    pos_dim = self.embed_dim // 4
    omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
    omega = (1. / (temperature**omega)).unsqueeze(0)
    out_x = grid_x.reshape([-1, 1]).matmul(omega)
    out_y = grid_y.reshape([-1, 1]).matmul(omega)
    groups = [
        paddle.sin(out_y),
        paddle.cos(out_y),
        paddle.sin(out_x),
        paddle.cos(out_x),
    ]
    pos_emb = paddle.concat(groups, axis=1)
    return pos_emb.reshape([1, h, w, self.embed_dim])
def forward(self, inputs):
    """Run the backbone and collect the feature maps at ``out_indices``.

    Args:
        inputs (dict): must contain the key ``'image'``.

    Returns:
        list[Tensor]: one tensor per block index listed in
        ``self.out_indices``, each passed through ``self.norm`` and, when
        ``self.with_fpn`` is set, through the matching ``fpn1``..``fpn4``.
    """
    patches = self.patch_embed(inputs['image'])
    # Move channels last; the blocks are fed this layout.
    x = patches.transpose([0, 2, 3, 1])
    _, Hp, Wp, _ = paddle.shape(x)

    if self.use_abs_pos:
        x = x + self.get_2d_sincos_position_embedding(Hp, Wp)
    if self.use_abs_pos_emb:
        # NOTE(review): resize_pos_embed returns a 3-D tensor that still has
        # the leading token, while x is 4-D here - confirm this addition
        # broadcasts as intended.
        resized = self.resize_pos_embed(self.pos_embed,
                                        (self.pos_h, self.pos_w), (Hp, Wp))
        x = x + resized

    feats = []
    for idx, blk in enumerate(self.blocks):
        if self.use_checkpoint and self.training:
            # Recompute activations in backward instead of storing them.
            x = paddle.distributed.fleet.utils.recompute(
                blk, x, preserve_rng_state=True)
        else:
            x = blk(x)
        if idx in self.out_indices:
            # NOTE(review): self.norm receives a channels-first tensor; when
            # it is nn.LayerNorm(embed_dim) the normalized axis is the last
            # one - confirm.
            feats.append(self.norm(x.transpose([0, 3, 1, 2])))

    if not self.with_fpn:
        return feats

    fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
    for level, feat in enumerate(feats):
        feats[level] = fpns[level](feat)
    return feats
@property
def num_layers(self):
    """Number of transformer blocks held in ``self.blocks``."""
    block_count = len(self.blocks)
    return block_count
@property
def no_weight_decay(self):
    """Names of the parameters reported as exempt from weight decay."""
    exempt = ('pos_embed', 'cls_token')
    return set(exempt)
@property
def out_shape(self):
    """One ``ShapeSpec`` per output level, pairing channels with stride."""
    specs = []
    for channels, stride in zip(self.out_channels, self.out_strides):
        specs.append(ShapeSpec(channels=channels, stride=stride))
    return specs
class LayerNorm(nn.Layer):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
    variance normalization over the channel dimension for inputs that have shape
    (batch_size, channels, height, width).
    Note that this modified LayerNorm is only used in ResBlock and SimpleFeaturePyramid.
    In ViT, we use the nn.LayerNorm.

    Args:
        normalized_shape (int): number of channels (size of axis 1).
        eps (float): value added to the variance before the square root.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        # create_parameter falls back to the framework's default initializer
        # when none is given; the affine scale and shift are pinned to 1 and
        # 0 so that a freshly built layer is a plain normalization.
        self.weight = self.create_parameter(
            [normalized_shape],
            default_initializer=nn.initializer.Constant(value=1.0))
        self.bias = self.create_parameter(
            [normalized_shape],
            default_initializer=nn.initializer.Constant(value=0.0))
        self.eps = eps
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalize ``x`` over axis 1, then apply the per-channel affine."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / paddle.sqrt(s + self.eps)
        # [:, None, None] lets the 1-D parameters broadcast over H and W.
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
@register
@serializable
class SimpleFeaturePyramid(nn.Layer):
    """Build several pyramid levels out of a single backbone feature map.

    Each stage resamples the first input feature by a fixed scale factor and
    projects it to ``out_channels``; one extra level is produced by
    subsampling the last stage output with ``top_block``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 spatial_scales,
                 num_levels=4,
                 use_bias=False):
        """
        Args:
            in_channels (list[int]): input channels of each level which can be
                derived from the output shape of backbone by from_config;
                only the first entry is used.
            out_channels (int): output channel of each level.
            spatial_scales (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features which can be derived from
                the output shape of backbone by from_config
            num_levels (int): number of levels of output features (4 or 5).
            use_bias (bool): whether use bias or not.

        Raises:
            NotImplementedError: if ``num_levels`` is neither 4 nor 5.
        """
        super(SimpleFeaturePyramid, self).__init__()
        self.in_channels = in_channels[0]
        self.out_channels = out_channels
        self.num_levels = num_levels
        self.stages = []
        if num_levels == 4:
            scale_factors = [2.0, 1.0, 0.5]
        elif num_levels == 5:
            scale_factors = [4.0, 2.0, 1.0, 0.5]
        else:
            raise NotImplementedError(
                f"num_levels={num_levels} is not supported yet.")

        dim = in_channels[0]
        for scale in scale_factors:
            out_dim = dim
            if scale == 4.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2),
                    # Channel-wise LayerNorm: the tensor is (N, C, H, W), so
                    # nn.LayerNorm would normalize the width axis instead.
                    LayerNorm(dim // 2),
                    nn.GELU(),
                    nn.Conv2DTranspose(
                        dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2)
                ]
                out_dim = dim // 2
            elif scale == 1.0:
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]

            # Shared projection head: 1x1 conv then 3x3 conv, each followed
            # by the channel-wise LayerNorm.
            layers.extend([
                nn.Conv2D(
                    out_dim,
                    out_channels,
                    kernel_size=1,
                    bias_attr=use_bias),
                LayerNorm(out_channels),
                nn.Conv2D(
                    out_channels,
                    out_channels,
                    kernel_size=3,
                    padding=1,
                    bias_attr=use_bias),
                LayerNorm(out_channels),
            ])
            layers = nn.Sequential(*layers)

            # Sublayer name encodes -log2 of the stage's overall scale.
            stage = -int(math.log2(spatial_scales[0] * scale))
            self.add_sublayer(f"simfp_{stage}", layers)
            self.stages.append(layers)

        # top block output feature maps.
        self.top_block = nn.Sequential(
            nn.MaxPool2D(
                kernel_size=1, stride=2, padding=0))

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``spatial_scales`` from ``input_shape``."""
        return {
            'in_channels': [i.channels for i in input_shape],
            'spatial_scales': [1.0 / i.stride for i in input_shape],
        }

    @property
    def out_shape(self):
        """One ``ShapeSpec`` with ``out_channels`` per output level."""
        return [
            ShapeSpec(channels=self.out_channels)
            for _ in range(self.num_levels)
        ]

    def forward(self, feats):
        """
        Args:
            feats (list[Tensor]): backbone outputs; only ``feats[0]`` is used.
        Returns:
            list[Tensor]: the output of every stage followed by the top-block
            output computed from the last stage.
        """
        features = feats[0]
        results = [stage(features) for stage in self.stages]
        results.append(self.top_block(results[-1]))
        assert self.num_levels == len(results)
        return results

View File

@@ -0,0 +1,607 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle
import numpy as np
def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
    """Encode target boxes as weighted (dx, dy, dw, dh) deltas of source boxes.

    Both box arguments are indexed as ``[:, 0..3]`` and read as
    (x1, y1, x2, y2). ``weights`` scales the four delta components.
    """

    def _size_and_center(boxes):
        # Width, height and center of corner-format boxes.
        w = boxes[:, 2] - boxes[:, 0]
        h = boxes[:, 3] - boxes[:, 1]
        return w, h, boxes[:, 0] + 0.5 * w, boxes[:, 1] + 0.5 * h

    src_w, src_h, src_cx, src_cy = _size_and_center(src_boxes)
    tgt_w, tgt_h, tgt_cx, tgt_cy = _size_and_center(tgt_boxes)

    wx, wy, ww, wh = weights
    components = (
        wx * (tgt_cx - src_cx) / src_w,
        wy * (tgt_cy - src_cy) / src_h,
        ww * paddle.log(tgt_w / src_w),
        wh * paddle.log(tgt_h / src_h),
    )
    return paddle.stack(components, axis=1)
def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
    """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.

    Every 4th column of ``deltas`` is read as dx, dy, dw, dh respectively and
    divided by the matching entry of ``weights``. When ``max_shape`` is given
    the x coordinates are clipped to ``[0, max_shape[1]]`` and the y
    coordinates to ``[0, max_shape[0]]``.

    Note: return tensor shape [n,1,4]
        If you want to add a reshape, please add after the calling code instead of here.
    """
    max_log_ratio = math.log(1000.0 / 16)

    box_w = boxes[:, 2] - boxes[:, 0]
    box_h = boxes[:, 3] - boxes[:, 1]
    center_x = boxes[:, 0] + 0.5 * box_w
    center_y = boxes[:, 1] + 0.5 * box_h
    # Column views so that they broadcast against the strided delta slices.
    w_col, h_col = box_w.unsqueeze(1), box_h.unsqueeze(1)
    cx_col, cy_col = center_x.unsqueeze(1), center_y.unsqueeze(1)

    wx, wy, ww, wh = weights
    dx = deltas[:, 0::4] / wx
    dy = deltas[:, 1::4] / wy
    # Prevent sending too large values into paddle.exp()
    dw = paddle.clip(deltas[:, 2::4] / ww, max=max_log_ratio)
    dh = paddle.clip(deltas[:, 3::4] / wh, max=max_log_ratio)

    pred_ctr_x = dx * w_col + cx_col
    pred_ctr_y = dy * h_col + cy_col
    pred_w = paddle.exp(dw) * w_col
    pred_h = paddle.exp(dh) * h_col

    corners = [
        pred_ctr_x - 0.5 * pred_w,
        pred_ctr_y - 0.5 * pred_h,
        pred_ctr_x + 0.5 * pred_w,
        pred_ctr_y + 0.5 * pred_h,
    ]
    pred_boxes = paddle.stack(corners, axis=-1)

    if max_shape is not None:
        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
            min=0, max=max_shape[1])
        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
            min=0, max=max_shape[0])
    return pred_boxes
def bbox2delta_v2(src_boxes,
                  tgt_boxes,
                  delta_mean=[0.0, 0.0, 0.0, 0.0],
                  delta_std=[1.0, 1.0, 1.0, 1.0]):
    """Encode bboxes to deltas normalized by a mean and a std.

    Same geometry as bbox2delta(), but instead of multiplying by weights the
    raw (dx, dy, dw, dh) deltas are shifted by ``delta_mean`` and divided by
    ``delta_std``.
    """

    def _size_and_center(boxes):
        # Width, height and center of corner-format boxes.
        w = boxes[:, 2] - boxes[:, 0]
        h = boxes[:, 3] - boxes[:, 1]
        return w, h, boxes[:, 0] + 0.5 * w, boxes[:, 1] + 0.5 * h

    src_w, src_h, src_cx, src_cy = _size_and_center(src_boxes)
    tgt_w, tgt_h, tgt_cx, tgt_cy = _size_and_center(tgt_boxes)

    raw = paddle.stack(
        (
            (tgt_cx - src_cx) / src_w,
            (tgt_cy - src_cy) / src_h,
            paddle.log(tgt_w / src_w),
            paddle.log(tgt_h / src_h),
        ),
        axis=1)
    mean = paddle.to_tensor(delta_mean)
    std = paddle.to_tensor(delta_std)
    return (raw - mean) / std
def delta2bbox_v2(deltas,
                  boxes,
                  delta_mean=[0.0, 0.0, 0.0, 0.0],
                  delta_std=[1.0, 1.0, 1.0, 1.0],
                  max_shape=None,
                  ctr_clip=32.0):
    """Decode deltas to bboxes.

    Counterpart of bbox2delta_v2(): deltas are first de-normalized with
    ``delta_std`` and ``delta_mean``. With ``ctr_clip`` set, the center
    shifts are clipped to ``[-ctr_clip, ctr_clip]`` and the log-size deltas
    only from above; with ``ctr_clip=None`` the log-size deltas are clipped
    on both sides and the center shifts are left unclipped.
    Used in YOLOFHead.

    Note: return tensor shape [n,1,4]
        If you want to add a reshape, please add after the calling code instead of here.
    """
    max_log_ratio = math.log(1000.0 / 16)

    box_w = boxes[:, 2] - boxes[:, 0]
    box_h = boxes[:, 3] - boxes[:, 1]
    center_x = boxes[:, 0] + 0.5 * box_w
    center_y = boxes[:, 1] + 0.5 * box_h
    w_col, h_col = box_w.unsqueeze(1), box_h.unsqueeze(1)

    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
    # Center shifts are converted to absolute units before any clipping.
    dx = deltas[:, 0::4] * w_col
    dy = deltas[:, 1::4] * h_col
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    # Prevent sending too large values into paddle.exp()
    if ctr_clip is None:
        dw = dw.clip(min=-max_log_ratio, max=max_log_ratio)
        dh = dh.clip(min=-max_log_ratio, max=max_log_ratio)
    else:
        dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip)
        dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip)
        dw = paddle.clip(dw, max=max_log_ratio)
        dh = paddle.clip(dh, max=max_log_ratio)

    pred_ctr_x = dx + center_x.unsqueeze(1)
    pred_ctr_y = dy + center_y.unsqueeze(1)
    pred_w = paddle.exp(dw) * w_col
    pred_h = paddle.exp(dh) * h_col

    corners = [
        pred_ctr_x - 0.5 * pred_w,
        pred_ctr_y - 0.5 * pred_h,
        pred_ctr_x + 0.5 * pred_w,
        pred_ctr_y + 0.5 * pred_h,
    ]
    pred_boxes = paddle.stack(corners, axis=-1)

    if max_shape is not None:
        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
            min=0, max=max_shape[1])
        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
            min=0, max=max_shape[0])
    return pred_boxes
def expand_bbox(bboxes, scale):
    """Scale (x1, y1, x2, y2) boxes about their centers.

    Args:
        bboxes (ndarray): boxes indexed as ``[:, 0..3]``.
        scale: factor applied to both half-width and half-height.

    Returns:
        ndarray: float32 array shaped like ``bboxes`` holding the expanded
        corners in columns 0-3.
    """
    center_x = (bboxes[:, 2] + bboxes[:, 0]) * .5
    center_y = (bboxes[:, 3] + bboxes[:, 1]) * .5
    half_w = (bboxes[:, 2] - bboxes[:, 0]) * .5
    half_h = (bboxes[:, 3] - bboxes[:, 1]) * .5
    half_w *= scale
    half_h *= scale

    expanded = np.zeros(bboxes.shape, dtype=np.float32)
    expanded[:, 0] = center_x - half_w
    expanded[:, 1] = center_y - half_h
    expanded[:, 2] = center_x + half_w
    expanded[:, 3] = center_y + half_h
    return expanded
def clip_bbox(boxes, im_shape):
    """Clip (x1, y1, x2, y2) boxes to an image of shape ``(h, w, ...)``.

    x coordinates are limited to ``[0, w]`` and y coordinates to ``[0, h]``;
    the four clipped columns are stacked back along axis 1.
    """
    h, w = im_shape[0], im_shape[1]
    upper_bounds = (w, h, w, h)
    clipped = [
        boxes[:, col].clip(0, bound)
        for col, bound in enumerate(upper_bounds)
    ]
    return paddle.stack(clipped, axis=1)
def nonempty_bbox(boxes, min_size=0, return_mask=False):
    """Find boxes whose width and height both exceed ``min_size``.

    Returns the boolean mask when ``return_mask`` is True, otherwise the
    flattened indices of the boxes that pass.
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    valid = paddle.logical_and(heights > min_size, widths > min_size)
    if return_mask:
        return valid
    return paddle.nonzero(valid).flatten()
def bbox_area(boxes):
    """Return width * height for each (x1, y1, x2, y2) row of ``boxes``."""
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights
def bbox_overlaps(boxes1, boxes2):
    """
    Calculate pairwise IoU between boxes1 and boxes2.

    Pairs whose intersection is not positive get an overlap of exactly 0.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]
    Return:
        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]
    """
    num1, num2 = boxes1.shape[0], boxes2.shape[0]
    if num1 * num2 == 0:
        return paddle.zeros([num1, num2], dtype='float32')

    area1 = bbox_area(boxes1)
    area2 = bbox_area(boxes2)

    # [M, 1, 4] against [N, 4] broadcasts to every pair.
    expanded = paddle.unsqueeze(boxes1, 1)
    bottom_right = paddle.minimum(expanded[:, :, 2:], boxes2[:, 2:])
    top_left = paddle.maximum(expanded[:, :, :2], boxes2[:, :2])
    inter = (bottom_right - top_left).clip(min=0).prod(axis=2)

    union = paddle.unsqueeze(area1, 1) + area2 - inter
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
def batch_bbox_overlaps(bboxes1,
                        bboxes2,
                        mode='iou',
                        is_aligned=False,
                        eps=1e-6):
    """Calculate overlap between two set of bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): boxes in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): boxes in <x1, y1, x2, y2> format or empty.
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): lower bound applied to the denominators for
            numerical stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,).
        For "iou" and "iof" the overlaps themselves; for "giou" the value
        ``1 - GIoU``. When either input has no boxes, a tensor filled with 1.

    Note:
        NOTE(review): the body indexes the inputs as ``[:, k]``, i.e. as 2-D
        (m, 4) tensors - confirm before passing batched inputs.
    """
    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
    # Either the boxes are empty or the length of the last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    # Normalised to a list so it can be extended no matter whether ``shape``
    # yields a list or a tuple.
    batch_shape = list(bboxes1.shape[:-2])

    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        tail = [rows] if is_aligned else [rows, cols]
        return paddle.full(batch_shape + tail, 1)

    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])

    if is_aligned:
        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]

        wh = (rb - lt).clip(min=0)  # [rows, 2]
        overlap = wh[:, 0] * wh[:, 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
    else:
        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
                            bboxes2[:, :2])  # [rows, cols, 2]
        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
                            bboxes2[:, 2:])  # [rows, cols, 2]

        wh = (rb - lt).clip(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]

        if mode in ['iou', 'giou']:
            union = area1.reshape([rows, 1]) \
                    + area2.reshape([1, cols]) - overlap
        else:
            union = area1[:, None]
        if mode == 'giou':
            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
                                         bboxes2[:, :2])
            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
                                         bboxes2[:, 2:])

    eps = paddle.to_tensor([eps])
    union = paddle.maximum(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious

    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
    # Ellipsis indexing works for both the aligned [rows, 2] and the pairwise
    # [rows, cols, 2] layout.
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = paddle.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return 1 - gious
def xywh2xyxy(box):
    """Convert a center-format ``(x, y, w, h)`` box to ``[x1, y1, x2, y2]``."""
    center_x, center_y, width, height = box
    return [
        center_x - width * 0.5,
        center_y - height * 0.5,
        center_x + width * 0.5,
        center_y + height * 0.5,
    ]
def make_grid(h, w, dtype):
    """Return an ``h`` x ``w`` grid of (x, y) indices cast to ``dtype``.

    The two index maps are stacked along axis 2 with x first.
    """
    row_ids = paddle.arange(h)
    col_ids = paddle.arange(w)
    yv, xv = paddle.meshgrid([row_ids, col_ids])
    grid = paddle.stack((xv, yv), 2)
    return grid.cast(dtype=dtype)
def decode_yolo(box, anchor, downsample_ratio):
    """decode yolo box

    Centers are offset by the cell index and divided by the grid size; sizes
    are ``exp`` of the prediction times the anchor, divided by
    ``downsample_ratio`` times the grid size.

    Args:
        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
        anchor (list): anchor with the shape [na, 2]
        downsample_ratio (int): downsample ratio
    Return:
        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
    """
    x, y, w, h = box
    na, grid_h, grid_w = x.shape[1:4]

    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
    cell_x = grid[:, :, :, :, 0:1]
    cell_y = grid[:, :, :, :, 1:2]
    x1 = (x + cell_x) / grid_w
    y1 = (y + cell_y) / grid_h

    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
    anchor = anchor.reshape((1, na, 1, 1, 2))
    anchor_w = anchor[:, :, :, :, 0:1]
    anchor_h = anchor[:, :, :, :, 1:2]
    w1 = paddle.exp(w) * anchor_w / (downsample_ratio * grid_w)
    h1 = paddle.exp(h) * anchor_h / (downsample_ratio * grid_h)

    return [x1, y1, w1, h1]
def batch_iou_similarity(box1, box2, eps=1e-9):
    """Calculate iou of box1 and box2 in batch

    Args:
        box1 (Tensor): box with the shape [N, M1, 4]
        box2 (Tensor): box with the shape [N, M2, 4]
        eps (float): added to the union to avoid division by zero
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
    """
    lhs = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
    rhs = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
    lhs_min, lhs_max = lhs[:, :, :, 0:2], lhs[:, :, :, 2:4]
    rhs_min, rhs_max = rhs[:, :, :, 0:2], rhs[:, :, :, 2:4]

    inter_min = paddle.maximum(lhs_min, rhs_min)
    inter_max = paddle.minimum(lhs_max, rhs_max)
    overlap = (inter_max - inter_min).clip(0).prod(-1)

    area_lhs = (lhs_max - lhs_min).clip(0).prod(-1)
    area_rhs = (rhs_max - rhs_min).clip(0).prod(-1)
    union = area_lhs + area_rhs - overlap + eps
    return overlap / union
def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
    """calculate the iou of box1 and box2

    Each box argument is unpacked into four tensors that the body treats as
    corner coordinates (x1, y1, x2, y2). When several flags are set, ``giou``
    takes precedence over ``diou``, which takes precedence over ``ciou``.

    Args:
        box1 (list): [x1, y1, x2, y2]
        box2 (list): [x1, y1, x2, y2]
        giou (bool): whether use giou or not, default False
        diou (bool): whether use diou or not, default False
        ciou (bool): whether use ciou or not, default False
        eps (float): epsilon to avoid divide by zero
    Return:
        iou (Tensor): iou (or the selected variant) of box1 and box2
    """
    px1, py1, px2, py2 = box1
    gx1, gy1, gx2, gy2 = box2

    inter_w = (paddle.minimum(px2, gx2) - paddle.maximum(px1, gx1)).clip(0)
    inter_h = (paddle.minimum(py2, gy2) - paddle.maximum(py1, gy1)).clip(0)
    overlap = inter_w * inter_h

    area1 = ((px2 - px1) * (py2 - py1)).clip(0)
    area2 = ((gx2 - gx1) * (gy2 - gy1)).clip(0)
    union = area1 + area2 - overlap + eps
    iou = overlap / union

    if not (giou or ciou or diou):
        return iou

    # convex w, h
    cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
    ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
    if giou:
        c_area = cw * ch + eps
        return iou - (c_area - union) / c_area

    # convex diagonal squared
    c2 = cw**2 + ch**2 + eps
    # center distance
    rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
    if diou:
        return iou - rho2 / c2

    # Aspect-ratio consistency term; eps keeps the heights non-zero.
    w1, h1 = px2 - px1, py2 - py1 + eps
    w2, h2 = gx2 - gx1, gy2 - gy1 + eps
    delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
    v = (4 / math.pi**2) * paddle.pow(delta, 2)
    alpha = v / (1 + eps - iou + v)
    # alpha is treated as a constant during backpropagation.
    alpha.stop_gradient = True
    return iou - (rho2 / c2 + v * alpha)
def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
    """
    Calculate the iou of box1 and box2 with numpy.

    The pairwise intersection is computed with broadcasting rather than a
    Python loop over the boxes of ``box2``.

    Args:
        box1 (ndarray): [N, 4]
        box2 (ndarray): [M, 4], usually N != M
        x1y1x2y2 (bool): whether in x1y1x2y2 style, default True; otherwise
            the boxes are read as (cx, cy, w, h)
        eps (float): epsilon to avoid divide by zero
    Return:
        iou (ndarray): iou of box1 and box2, [N, M]
    """
    if x1y1x2y2:
        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
    else:
        # cxcywh style
        # Transform from center and width to exact coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    # Intersection rectangle for every (box1, box2) pair: [N, 1] against
    # [1, M] broadcasts to [N, M]. The float32 cast keeps the precision the
    # intersection coordinates have always been stored in.
    inter_rect_x1 = np.maximum(b1_x1[:, None], b2_x1[None, :]).astype(np.float32)
    inter_rect_y1 = np.maximum(b1_y1[:, None], b2_y1[None, :]).astype(np.float32)
    inter_rect_x2 = np.minimum(b1_x2[:, None], b2_x2[None, :]).astype(np.float32)
    inter_rect_y2 = np.minimum(b1_y2[:, None], b2_y2[None, :]).astype(np.float32)

    # Intersection area
    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(
        inter_rect_y2 - inter_rect_y1, 0)

    # Union Area
    b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1))[:, None]
    b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1))[None, :]
    ious = inter_area / (b1_area + b2_area - inter_area + eps)
    return ious
def bbox2distance(points, bbox, max_dis=None, eps=0.1):
    """Encode bounding boxes as distances from points to the four sides.

    Args:
        points (Tensor): Shape (n, 2), [x, y].
        bbox (Tensor): Shape (n, 4), "xyxy" format
        max_dis (float): Upper bound of the distance; when given, distances
            are clipped to ``[0, max_dis - eps]``.
        eps (float): a small value to ensure target < max_dis, instead <=
    Returns:
        Tensor: distances stacked on the last axis as
        (left, top, right, bottom).
    """
    px, py = points[:, 0], points[:, 1]
    distances = [
        px - bbox[:, 0],  # left
        py - bbox[:, 1],  # top
        bbox[:, 2] - px,  # right
        bbox[:, 3] - py,  # bottom
    ]
    if max_dis is not None:
        upper = max_dis - eps
        distances = [d.clip(min=0, max=upper) for d in distances]
    return paddle.stack(distances, -1)
def distance2bbox(points, distance, max_shape=None):
    """Decode distance prediction to bounding box.

    Args:
        points (Tensor): Shape (n, 2), [x, y].
        distance (Tensor): Distance from the given point to 4
            boundaries (left, top, right, bottom).
        max_shape (tuple): Shape of the image, indexed as (h, w); when
            given, x is clipped to ``[0, w]`` and y to ``[0, h]``.
    Returns:
        Tensor: Decoded bboxes stacked on the last axis as (x1, y1, x2, y2).
    """
    px, py = points[:, 0], points[:, 1]
    x1 = px - distance[:, 0]
    y1 = py - distance[:, 1]
    x2 = px + distance[:, 2]
    y2 = py + distance[:, 3]
    if max_shape is not None:
        img_h, img_w = max_shape[0], max_shape[1]
        x1 = x1.clip(min=0, max=img_w)
        y1 = y1.clip(min=0, max=img_h)
        x2 = x2.clip(min=0, max=img_w)
        y2 = y2.clip(min=0, max=img_h)
    return paddle.stack([x1, y1, x2, y2], -1)
def bbox_center(boxes):
    """Get bbox centers from boxes.

    Args:
        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
    Returns:
        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
    """
    xmin, ymin = boxes[..., 0], boxes[..., 1]
    xmax, ymax = boxes[..., 2], boxes[..., 3]
    centers = [(xmin + xmax) / 2, (ymin + ymax) / 2]
    return paddle.stack(centers, axis=-1)
def batch_distance2bbox(points, distance, max_shapes=None):
"""Decode distance prediction to bounding box for batch.
Args:
points (Tensor): [B, ..., 2], "xy" format
distance (Tensor): [B, ..., 4], "ltrb" format
max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
Returns:
Tensor: Decoded bboxes, "x1y1x2y2" format.
"""
# Split the last axis into its two halves: (left, top) and (right, bottom).
lt, rb = paddle.split(distance, 2, -1)
# while tensor add parameters, parameters should be better placed on the second place
x1y1 = -lt + points
x2y2 = rb + points
out_bbox = paddle.concat([x1y1, x2y2], -1)
if max_shapes is not None:
# Reverse the last axis and repeat it twice so the limits line up with the
# four box coordinates.
max_shapes = max_shapes.flip(-1).tile([1, 2])
# Insert singleton axes at position 1 until the limits have as many
# dimensions as the boxes, so that they broadcast.
delta_dim = out_bbox.ndim - max_shapes.ndim
for _ in range(delta_dim):
max_shapes.unsqueeze_(1)
# Upper clamp: keep coordinates below the limit, otherwise use the limit.
out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
# Lower clamp: coordinates that are not positive become 0.
out_bbox = paddle.where(out_bbox > 0, out_bbox,
paddle.zeros_like(out_bbox))
return out_bbox
def iou_similarity(box1, box2, eps=1e-10):
    """Calculate iou of box1 and box2

    Args:
        box1 (Tensor): box with the shape [M1, 4]
        box2 (Tensor): box with the shape [M2, 4]
        eps (float): added to the union to avoid division by zero
    Return:
        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
    """
    lhs = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
    rhs = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]
    lhs_min, lhs_max = lhs[:, :, 0:2], lhs[:, :, 2:4]
    rhs_min, rhs_max = rhs[:, :, 0:2], rhs[:, :, 2:4]

    inter_min = paddle.maximum(lhs_min, rhs_min)
    inter_max = paddle.minimum(lhs_max, rhs_max)
    overlap = (inter_max - inter_min).clip(0).prod(-1)

    area_lhs = (lhs_max - lhs_min).clip(0).prod(-1)
    area_rhs = (rhs_max - rhs_min).clip(0).prod(-1)
    union = area_lhs + area_rhs - overlap + eps
    return overlap / union

View File

@@ -0,0 +1,40 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
def _get_class_default_kwargs(cls, *args, **kwargs):
"""
Get default arguments of a class in dict format, if args and
kwargs is specified, it will replace default arguments
"""
varnames = cls.__init__.__code__.co_varnames
argcount = cls.__init__.__code__.co_argcount
keys = varnames[:argcount]
assert keys[0] == 'self'
keys = keys[1:]
values = list(cls.__init__.__defaults__)
assert len(values) == len(keys)
if len(args) > 0:
for i, arg in enumerate(args):
values[i] = arg
default_kwargs = dict(zip(keys, values))
if len(kwargs) > 0:
for k, v in kwargs.items():
default_kwargs[k] = v
return default_kwargs

View File

@@ -0,0 +1,16 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .detr_head import *

View File

@@ -0,0 +1,534 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..initializer import linear_init_, constant_
from ..transformers.utils import inverse_sigmoid
import pycocotools.mask as mask_util
__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead']
class MLP(nn.Layer):
    """This code is based on
        https://github.com/facebookresearch/detr/blob/main/models/detr.py

    A stack of ``num_layers`` linear layers with ReLU after every layer
    except the last one.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Feature sizes at the layer boundaries: input, hidden..., output.
        dims = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            nn.Linear(d_in, d_out) for d_in, d_out in zip(dims[:-1], dims[1:]))

        self._reset_parameters()

    def _reset_parameters(self):
        """Apply ``linear_init_`` to every linear layer."""
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        """Run ``x`` through the stack; the final layer output is not activated."""
        last = self.num_layers - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last:
                x = F.relu(x)
        return x
class MultiHeadAttentionMap(nn.Layer):
    """This code is based on
        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py

    This is a 2D attention module, which only returns the attention softmax (no multiplication by value)
    """

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        if bias:
            bias_attr = paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant())
        else:
            bias_attr = False

        # Queries use a linear projection, keys a 1x1 convolution.
        self.q_proj = nn.Linear(
            query_dim, hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        """Return softmax-normalized attention weights of shape
        ``[bs, num_queries, num_heads, h, w]``.

        ``mask``, when given, is added to the raw weights before the softmax.
        """
        q = self.q_proj(q)
        k = self.k_proj(k)

        bs, num_queries = q.shape[0], q.shape[1]
        n = self.num_heads
        c = self.hidden_dim // self.num_heads
        h, w = k.shape[-2], k.shape[-1]

        # Fold the head axis into the batch axis so one bmm covers all heads.
        # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
        qh = q.reshape([bs, num_queries, n, c]).transpose([0, 2, 1, 3])
        qh = qh.reshape([-1, num_queries, c])
        kh = k.reshape([bs, n, c, h, w]).reshape([-1, c, h * w])
        weights = paddle.bmm(qh * self.normalize_fact, kh)
        weights = weights.reshape([bs, n, num_queries, h, w])
        weights = weights.transpose([0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247
        normalized = F.softmax(weights.flatten(3), axis=-1)
        weights = normalized.reshape(weights.shape)
        return self.dropout(weights)
class MaskHeadFPNConv(nn.Layer):
    """This code is based on
        https://github.com/facebookresearch/detr/blob/main/models/segmentation.py

    Simple convolutional head, using group norm.
    Upsampling is done using a FPN approach
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        super().__init__()

        # Channel schedule: input_dim, then context_dim halved four times.
        inter_dims = [input_dim]
        inter_dims += [context_dim // (2**i) for i in range(1, 5)]

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)
        self.conv_inter = nn.LayerList([
            self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr,
                              bias_attr)
            for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:])
        ])
        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 convolutions mapping each FPN feature to the matching width.
        self.adapter = nn.LayerList([
            nn.Conv2D(
                fpn_dim,
                inter_dims[level + 1],
                1,
                weight_attr=weight_attr,
                bias_attr=bias_attr) for level, fpn_dim in enumerate(fpn_dims)
        ])

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        """Return a Conv2D -> GroupNorm -> ReLU block with 'same' padding."""
        conv = nn.Conv2D(
            in_dims,
            out_dims,
            kernel_size,
            padding=kernel_size // 2,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        return nn.Sequential(conv, nn.GroupNorm(num_groups, out_dims), nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        """Predict one mask logit map per (image, query) pair.

        ``x`` is tiled by the size of axis 1 of ``bbox_attention_map`` and
        concatenated on the channel axis with the attention maps flattened
        over their first two axes; the result is refined while being resized
        to and summed with each adapted FPN feature.
        """
        repeat = [bbox_attention_map.shape[1], 1, 1, 1]
        x = paddle.concat(
            [x.tile(repeat), bbox_attention_map.flatten(0, 1)], 1)
        x = self.conv0(x)

        stages = zip(self.conv_inter[:-1], self.adapter, fpns)
        for inter_layer, adapter_layer, feat in stages:
            feat = adapter_layer(feat).tile(
                [bbox_attention_map.shape[1], 1, 1, 1])
            x = inter_layer(x)
            # Resize to the FPN feature's spatial size before the skip sum.
            x = feat + F.interpolate(x, size=feat.shape[-2:])

        x = self.conv_inter[-1](x)
        return self.conv_out(x)
@register
class DETRHead(nn.Layer):
    """Classification / box head for DETR with an optional mask branch.

    Args:
        num_classes (int): Number of object classes. One extra output is
            added unless ``use_focal_loss`` is set.
        hidden_dim (int): Width of the incoming decoder features.
        nhead (int): Number of heads of the mask attention module.
        num_mlp_layers (int): Number of layers of the box regression MLP.
        loss: Injected loss layer called in training mode.
        fpn_dims (list[int]): Channels of the FPN features used by the mask
            head.
        with_mask_head (bool): Build and run the mask branch.
        use_focal_loss (bool): Skip the extra output class when True.
    """
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],
                 with_mask_head=False,
                 use_focal_loss=False):
        super(DETRHead, self).__init__()
        # Reserve one more output for the background class unless focal
        # loss is used.
        if use_focal_loss:
            self.num_classes = num_classes
        else:
            self.num_classes = num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(hidden_dim,
                                                        hidden_dim, nhead)
            self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims,
                                             hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise the classification layer."""
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        """Derive constructor arguments from the upstream components.

        ``fpn_dims`` is the channel list of ``input_shape`` in reverse
        order with its first (i.e. originally last) entry dropped.
        """
        reversed_channels = [spec.channels for spec in input_shape[::-1]]
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': reversed_channels[1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        """Rasterise polygons into zero-padded float masks, one per image.

        Args:
            gt_poly: Per-image collections of per-object polygons.
            pad_mask: Per-image padding masks; the sums of the first column
                and of the first row are taken as the valid height and
                width.

        Returns:
            list[Tensor]: For every image a tensor of shape
            ``[num_objects, pad_mask.shape[1], pad_mask.shape[2]]``.
        """
        out_gt_mask = []
        for polygons, padding in zip(gt_poly, pad_mask):
            height = int(padding[:, 0].sum())
            width = int(padding[0, :].sum())

            masks = []
            for obj_poly in polygons:
                merged_rle = mask_util.merge(
                    mask_util.frPyObjects(obj_poly, height, width))
                decoded = paddle.to_tensor(mask_util.decode(merged_rle))
                masks.append(decoded.astype('float32'))
            masks = paddle.stack(masks)

            # Paste the valid region into a canvas of the padded size.
            masks_pad = paddle.zeros(
                [masks.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            masks_pad[:, :height, :width] = masks
            out_gt_mask.append(masks_pad)
        return out_gt_mask

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats, memory, src_proj, src_mask) as
                produced by the transformer. ``feats`` is indexed as
                [num_levels, batch_size, num_queries, hidden_dim].
            body_feats (List(Tensor)): backbone features; only used by the
                mask branch, in reverse order without the first reversed
                entry.
            inputs (dict): ground-truth dict, required in training mode
                ('gt_bbox', 'gt_class', optionally 'gt_poly' + 'pad_mask').

        Returns:
            The loss result in training mode, otherwise the tuple
            (last-level boxes, last-level logits, masks or None).
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        outputs_bbox = F.sigmoid(self.bbox_head(feats))

        outputs_seg = None
        if self.with_mask_head:
            attention_map = self.bbox_attention(feats[-1], memory, src_mask)
            fpn_feats = list(body_feats[::-1])[1:]
            seg = self.mask_head(src_proj, attention_map, fpn_feats)
            outputs_seg = seg.reshape([
                feats.shape[1], feats.shape[2], seg.shape[-2], seg.shape[-1]
            ])

        if not self.training:
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs
        gt_mask = None
        if 'gt_poly' in inputs:
            gt_mask = self.get_gt_mask_from_polygons(inputs['gt_poly'],
                                                     inputs['pad_mask'])
        return self.loss(
            outputs_bbox,
            outputs_logit,
            inputs['gt_bbox'],
            inputs['gt_class'],
            masks=outputs_seg,
            gt_mask=gt_mask)
@register
class DeformableDETRHead(nn.Layer):
    """Classification / box head for Deformable DETR.

    Args:
        num_classes (int): Number of outputs of the classification layer.
        hidden_dim (int): Width of the incoming decoder features.
        nhead (int): Stored on the instance; not used by this class itself.
        num_mlp_layers (int): Number of layers of the box regression MLP.
        loss: Injected loss layer called in training mode.
    """
    __shared__ = ['num_classes', 'hidden_dim']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=512,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss'):
        super(DeformableDETRHead, self).__init__()
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.loss = loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        self.bbox_head = MLP(hidden_dim,
                             hidden_dim,
                             output_dim=4,
                             num_layers=num_mlp_layers)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the score layer and the last box-regression layer.

        The score bias is set to -4.595, the last box layer gets an all-zero
        weight and a bias that is 0 for the first two outputs and -2.0 for
        the remaining ones.
        """
        linear_init_(self.score_head)
        constant_(self.score_head.bias, -4.595)

        last_layer = self.bbox_head.layers[-1]
        constant_(last_layer.weight)
        with paddle.no_grad():
            bias = paddle.zeros_like(last_layer.bias)
            bias[2:] = -2.0
            last_layer.bias.set_value(bias)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        """Forward ``hidden_dim`` and ``nhead`` as constructor arguments."""
        return {'hidden_dim': hidden_dim, 'nhead': nhead}

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats, memory, reference_points).
                ``feats`` is indexed as [num_levels, batch_size,
                num_queries, hidden_dim]; ``reference_points`` gets a new
                leading axis and is added to the first two box outputs.
            body_feats (List(Tensor)): not used by this head.
            inputs (dict): ground-truth dict, required in training mode
                ('gt_bbox' and 'gt_class').

        Returns:
            The loss result in training mode, otherwise the tuple
            (last-level boxes, last-level logits, None).
        """
        feats, memory, reference_points = out_transformer
        reference_points = inverse_sigmoid(reference_points.unsqueeze(0))
        raw_bbox = self.bbox_head(feats)

        # Same as "raw_bbox[:, :, :, :2] += reference_points", which yields
        # a wrong gradient in paddle, hence the explicit concat.
        shifted_xy = raw_bbox[:, :, :, :2] + reference_points
        remaining = raw_bbox[:, :, :, 2:]
        outputs_bbox = F.sigmoid(
            paddle.concat(
                [shifted_xy, remaining], axis=-1))
        outputs_logit = self.score_head(feats)

        if not self.training:
            return (outputs_bbox[-1], outputs_logit[-1], None)

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs
        return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
                         inputs['gt_class'])
@register
class DINOHead(nn.Layer):
    """Head that turns DINO transformer outputs into a loss or predictions.

    The class owns no layers of its own; it only stores the injected loss
    and routes the transformer outputs to it in training mode.
    """
    __inject__ = ['loss']

    def __init__(self, loss='DINOLoss'):
        super(DINOHead, self).__init__()
        self.loss = loss

    def forward(self, out_transformer, body_feats, inputs=None):
        """Compute the training loss or return the final predictions.

        Args:
            out_transformer (tuple): (dec_out_bboxes, dec_out_logits,
                enc_topk_bboxes, enc_topk_logits, dn_meta).
            body_feats: Not referenced by this method.
            inputs (dict): Required in training mode; must contain
                'gt_bbox' and 'gt_class'.

        Returns:
            Training: the result of ``self.loss`` (for a list ``dn_meta``
            a dict of per-key losses averaged over the groups).
            Otherwise: ``(dec_out_bboxes[-1], dec_out_logits[-1], None)``.
        """
        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
         dn_meta) = out_transformer
        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs

            if dn_meta is not None:
                if isinstance(dn_meta, list):
                    # A list of dn_meta means several query groups: every
                    # output is cut into len(dn_meta) equal chunks (axis 2
                    # for decoder outputs, axis 1 for encoder top-k ones).
                    dual_groups = len(dn_meta) - 1

                    dec_out_bboxes = paddle.split(
                        dec_out_bboxes, dual_groups + 1, axis=2)
                    dec_out_logits = paddle.split(
                        dec_out_logits, dual_groups + 1, axis=2)
                    enc_topk_bboxes = paddle.split(
                        enc_topk_bboxes, dual_groups + 1, axis=1)
                    enc_topk_logits = paddle.split(
                        enc_topk_logits, dual_groups + 1, axis=1)

                    # NOTE(review): these four lists are never read in this
                    # method.
                    dec_out_bboxes_list = []
                    dec_out_logits_list = []
                    dn_out_bboxes_list = []
                    dn_out_logits_list = []
                    loss = {}
                    for g_id in range(dual_groups + 1):
                        if dn_meta[g_id] is not None:
                            # Separate the denoising part from the matching
                            # part of this group along axis 2.
                            dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split(
                                dec_out_bboxes[g_id],
                                dn_meta[g_id]['dn_num_split'],
                                axis=2)
                            dn_out_logits_gid, dec_out_logits_gid = paddle.split(
                                dec_out_logits[g_id],
                                dn_meta[g_id]['dn_num_split'],
                                axis=2)
                        else:
                            dn_out_bboxes_gid, dn_out_logits_gid = None, None
                            dec_out_bboxes_gid = dec_out_bboxes[g_id]
                            dec_out_logits_gid = dec_out_logits[g_id]
                        # Prepend the encoder top-k output as an extra
                        # leading entry in front of the decoder outputs.
                        out_bboxes_gid = paddle.concat([
                            enc_topk_bboxes[g_id].unsqueeze(0),
                            dec_out_bboxes_gid
                        ])
                        out_logits_gid = paddle.concat([
                            enc_topk_logits[g_id].unsqueeze(0),
                            dec_out_logits_gid
                        ])
                        loss_gid = self.loss(
                            out_bboxes_gid,
                            out_logits_gid,
                            inputs['gt_bbox'],
                            inputs['gt_class'],
                            dn_out_bboxes=dn_out_bboxes_gid,
                            dn_out_logits=dn_out_logits_gid,
                            dn_meta=dn_meta[g_id])
                        # sum loss
                        for key, value in loss_gid.items():
                            loss.update({
                                key: loss.get(key, paddle.zeros([1])) + value
                            })

                    # average across (dual_groups + 1)
                    for key, value in loss.items():
                        loss.update({key: value / (dual_groups + 1)})
                    return loss
                else:
                    # Single group: split off the denoising part along
                    # axis 2.
                    dn_out_bboxes, dec_out_bboxes = paddle.split(
                        dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
                    dn_out_logits, dec_out_logits = paddle.split(
                        dec_out_logits, dn_meta['dn_num_split'], axis=2)
            else:
                dn_out_bboxes, dn_out_logits = None, None

            # Prepend the encoder top-k output in front of the decoder
            # outputs before handing everything to the loss.
            out_bboxes = paddle.concat(
                [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
            out_logits = paddle.concat(
                [enc_topk_logits.unsqueeze(0), dec_out_logits])

            return self.loss(
                out_bboxes,
                out_logits,
                inputs['gt_bbox'],
                inputs['gt_class'],
                dn_out_bboxes=dn_out_bboxes,
                dn_out_logits=dn_out_logits,
                dn_meta=dn_meta)
        else:
            return (dec_out_bboxes[-1], dec_out_logits[-1], None)
@register
class MaskDINOHead(nn.Layer):
    """Head that turns Mask DINO transformer outputs into a loss or predictions.

    The class owns no layers of its own; it only stores the injected loss
    and routes the transformer outputs to it in training mode.
    """
    __inject__ = ['loss']

    def __init__(self, loss='DINOLoss'):
        super(MaskDINOHead, self).__init__()
        self.loss = loss

    def forward(self, out_transformer, body_feats, inputs=None):
        """Compute the training loss or return the final predictions.

        Args:
            out_transformer (tuple): (dec_out_logits, dec_out_bboxes,
                dec_out_masks, enc_out, init_out, dn_meta). ``enc_out`` and
                a non-None ``init_out`` each unpack into
                (logits, bboxes, masks).
            body_feats: Not referenced by this method.
            inputs (dict): Required in training mode; must contain
                'gt_bbox', 'gt_class' and 'gt_segm'.

        Returns:
            Training: the result of ``self.loss``.
            Otherwise: ``(dec_out_bboxes[-1], dec_out_logits[-1],
            dec_out_masks[-1])``.
        """
        (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,
         dn_meta) = out_transformer
        if self.training:
            assert inputs is not None
            assert 'gt_bbox' in inputs and 'gt_class' in inputs
            assert 'gt_segm' in inputs

            if dn_meta is not None:
                # Split the denoising part from the matching part along
                # axis 2 of every decoder output.
                dn_out_logits, dec_out_logits = paddle.split(
                    dec_out_logits, dn_meta['dn_num_split'], axis=2)
                dn_out_bboxes, dec_out_bboxes = paddle.split(
                    dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
                dn_out_masks, dec_out_masks = paddle.split(
                    dec_out_masks, dn_meta['dn_num_split'], axis=2)
                if init_out is not None:
                    # The initial outputs are split the same way (axis 1)
                    # and prepended to the matching and denoising stacks.
                    init_out_logits, init_out_bboxes, init_out_masks = init_out
                    init_out_logits_dn, init_out_logits = paddle.split(
                        init_out_logits, dn_meta['dn_num_split'], axis=1)
                    init_out_bboxes_dn, init_out_bboxes = paddle.split(
                        init_out_bboxes, dn_meta['dn_num_split'], axis=1)
                    init_out_masks_dn, init_out_masks = paddle.split(
                        init_out_masks, dn_meta['dn_num_split'], axis=1)

                    dec_out_logits = paddle.concat(
                        [init_out_logits.unsqueeze(0), dec_out_logits])
                    dec_out_bboxes = paddle.concat(
                        [init_out_bboxes.unsqueeze(0), dec_out_bboxes])
                    dec_out_masks = paddle.concat(
                        [init_out_masks.unsqueeze(0), dec_out_masks])

                    dn_out_logits = paddle.concat(
                        [init_out_logits_dn.unsqueeze(0), dn_out_logits])
                    dn_out_bboxes = paddle.concat(
                        [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes])
                    dn_out_masks = paddle.concat(
                        [init_out_masks_dn.unsqueeze(0), dn_out_masks])
            else:
                dn_out_bboxes, dn_out_logits = None, None
                dn_out_masks = None

            # Prepend the encoder outputs in front of the decoder outputs
            # before handing everything to the loss.
            enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out
            out_logits = paddle.concat(
                [enc_out_logits.unsqueeze(0), dec_out_logits])
            out_bboxes = paddle.concat(
                [enc_out_bboxes.unsqueeze(0), dec_out_bboxes])
            out_masks = paddle.concat(
                [enc_out_masks.unsqueeze(0), dec_out_masks])

            return self.loss(
                out_bboxes,
                out_logits,
                inputs['gt_bbox'],
                inputs['gt_class'],
                masks=out_masks,
                gt_mask=inputs['gt_segm'],
                dn_out_logits=dn_out_logits,
                dn_out_bboxes=dn_out_bboxes,
                dn_out_masks=dn_out_masks,
                dn_meta=dn_meta)
        else:
            return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])

View File

@@ -0,0 +1,325 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
The copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.
"""
import math
import numpy as np
import paddle
import paddle.nn as nn
# Names exported by a star-import of this module.
# NOTE(review): ``vector_`` and ``bias_init_with_prob`` are defined further
# down in this module but are not listed here -- confirm that is intended.
__all__ = [
    'uniform_',
    'normal_',
    'constant_',
    'ones_',
    'zeros_',
    'xavier_uniform_',
    'xavier_normal_',
    'kaiming_uniform_',
    'kaiming_normal_',
    'linear_init_',
    'conv_init_',
    'reset_initialized_parameter',
]
def _no_grad_uniform_(tensor, a, b):
    """Overwrite ``tensor`` with uniform samples from ``a`` to ``b``.

    The new values have the shape and dtype of ``tensor`` and are written
    with gradient tracking disabled. Returns ``tensor``.
    """
    with paddle.no_grad():
        samples = paddle.uniform(
            shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)
        tensor.set_value(samples)
    return tensor
def _no_grad_normal_(tensor, mean=0., std=1.):
    """Overwrite ``tensor`` with normal samples of the given mean and std.

    The values are written with gradient tracking disabled. Returns
    ``tensor``.
    """
    with paddle.no_grad():
        samples = paddle.normal(mean=mean, std=std, shape=tensor.shape)
        tensor.set_value(samples)
    return tensor
def _no_grad_fill_(tensor, value=0.):
    """Overwrite every element of ``tensor`` with ``value``.

    The values are written with gradient tracking disabled. Returns
    ``tensor``.
    """
    with paddle.no_grad():
        filled = paddle.full_like(tensor, value, dtype=tensor.dtype)
        tensor.set_value(filled)
    return tensor
def uniform_(tensor, a, b):
    """
    Fill a tensor in place with uniformly distributed values.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): lower bound of the distribution.
        b (float|int): upper bound of the distribution.
    Return:
        tensor
    """
    filled = _no_grad_uniform_(tensor, a, b)
    return filled
def normal_(tensor, mean=0., std=1.):
    """
    Fill a tensor in place with normally distributed values.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        mean (float|int): mean of the distribution.
        std (float|int): standard deviation of the distribution.
    Return:
        tensor
    """
    filled = _no_grad_normal_(tensor, mean, std)
    return filled
def constant_(tensor, value=0.):
    """
    Fill a tensor in place with a single constant.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        value (float|int): value written to every element.
    Return:
        tensor
    """
    filled = _no_grad_fill_(tensor, value)
    return filled
def ones_(tensor):
    """
    Fill a tensor in place with ones.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
    Return:
        tensor
    """
    filled = _no_grad_fill_(tensor, 1)
    return filled
def zeros_(tensor):
    """
    Fill a tensor in place with zeros.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
    Return:
        tensor
    """
    filled = _no_grad_fill_(tensor, 0)
    return filled
def vector_(tensor, vector):
    """Overwrite ``tensor`` with the values of ``vector``.

    ``vector`` is converted to a tensor of ``tensor``'s dtype and written
    with gradient tracking disabled. Returns ``tensor``.
    """
    with paddle.no_grad():
        converted = paddle.to_tensor(vector, dtype=tensor.dtype)
        tensor.set_value(converted)
    return tensor
def _calculate_fan_in_and_fan_out(tensor, reverse=False):
"""
Calculate (fan_in, _fan_out) for tensor
Args:
tensor (Tensor): paddle.Tensor
reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True
Return:
Tuple[fan_in, fan_out]
"""
if tensor.ndim < 2:
raise ValueError(
"Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
)
if reverse:
num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
else:
num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
receptive_field_size = 1
if tensor.ndim > 2:
receptive_field_size = np.prod(tensor.shape[2:])
fan_in = num_input_fmaps * receptive_field_size
fan_out = num_output_fmaps * receptive_field_size
return fan_in, fan_out
def xavier_uniform_(tensor, gain=1., reverse=False):
    """
    Fill a tensor in place using Xavier (Glorot) uniform initialization.

    Values are drawn uniformly from [-k, k] with
    k = sqrt(3) * gain * sqrt(2 / (fan_in + fan_out)).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        gain (float): scaling factor, 1. by default.
        reverse (bool): layout of the first two axes, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    fan_total = float(fan_in + fan_out)
    std = gain * math.sqrt(2.0 / fan_total)
    limit = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -limit, limit)
def xavier_normal_(tensor, gain=1., reverse=False):
    """
    Fill a tensor in place using Xavier (Glorot) normal initialization.

    Values are drawn from a normal distribution with mean 0 and
    std = gain * sqrt(2 / (fan_in + fan_out)).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        gain (float): scaling factor, 1. by default.
        reverse (bool): layout of the first two axes, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    fan_total = float(fan_in + fan_out)
    std = gain * math.sqrt(2.0 / fan_total)
    return _no_grad_normal_(tensor, 0, std)
# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
def _calculate_correct_fan(tensor, mode, reverse=False):
    """Return fan_in or fan_out of ``tensor`` according to ``mode``.

    ``mode`` is matched case-insensitively against 'fan_in' / 'fan_out';
    any other value raises ValueError.
    """
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(
            mode, valid_modes))

    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
    if mode == 'fan_in':
        return fan_in
    return fan_out
def _calculate_gain(nonlinearity, param=None):
linear_fns = [
'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
'conv_transpose2d', 'conv_transpose3d'
]
if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
return 1
elif nonlinearity == 'tanh':
return 5.0 / 3
elif nonlinearity == 'relu':
return math.sqrt(2.0)
elif nonlinearity == 'leaky_relu':
if param is None:
negative_slope = 0.01
elif not isinstance(param, bool) and isinstance(
param, int) or isinstance(param, float):
# True/False are instances of int, hence check above
negative_slope = param
else:
raise ValueError("negative_slope {} not a valid number".format(
param))
return math.sqrt(2.0 / (1 + negative_slope**2))
elif nonlinearity == 'selu':
return 3.0 / 4
else:
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
def kaiming_uniform_(tensor,
                     a=0,
                     mode='fan_in',
                     nonlinearity='leaky_relu',
                     reverse=False):
    """
    Fill a tensor in place using Kaiming (He) uniform initialization.

    Values are drawn uniformly from [-k, k] with
    k = sqrt(3) * gain / sqrt(fan).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): parameter handed to the gain computation (the
            negative slope for 'leaky_relu'), 0 by default.
        mode (str): 'fan_in' (default) or 'fan_out'.
        nonlinearity (str): name of the nonlinearity the gain is taken for.
        reverse (bool): layout of the first two axes, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
    limit = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -limit, limit)
def kaiming_normal_(tensor,
                    a=0,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    reverse=False):
    """
    Fill a tensor in place using Kaiming (He) normal initialization.

    Values are drawn from a normal distribution with mean 0 and
    std = gain / sqrt(fan).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): parameter handed to the gain computation (the
            negative slope for 'leaky_relu'), 0 by default.
        mode (str): 'fan_in' (default) or 'fan_out'.
        nonlinearity (str): name of the nonlinearity the gain is taken for.
        reverse (bool): layout of the first two axes, False by default as
            [fout, fin, ...].
    Return:
        tensor
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
    return _no_grad_normal_(tensor, 0, std)
def linear_init_(module):
    """Uniformly re-initialise a linear layer's weight and bias.

    Both are drawn from [-bound, bound] with
    bound = 1 / sqrt(module.weight.shape[0]); a missing or None bias is
    skipped.
    """
    bound = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -bound, bound)
    if getattr(module, "bias", None) is not None:
        uniform_(module.bias, -bound, bound)
def conv_init_(module):
    """Uniformly re-initialise a convolution's weight and bias.

    Both are drawn from [-bound, bound] with
    bound = 1 / sqrt(prod(module.weight.shape[1:])); a None bias is skipped.
    """
    fan = np.prod(module.weight.shape[1:])
    bound = 1 / np.sqrt(fan)
    uniform_(module.weight, -bound, bound)
    if module.bias is None:
        return
    uniform_(module.bias, -bound, bound)
def bias_init_with_prob(prior_prob=0.01):
    """Return the conv/fc bias value matching a given prior probability.

    The result is ``-log((1 - prior_prob) / prior_prob)`` as a Python float.
    """
    odds_against = (1 - prior_prob) / prior_prob
    return float(-np.log(odds_against))
@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
    """
    Re-initialise the parameters of every conv, linear, embedding and
    norm sublayer of a model.

    * nn.Conv2D: weight and bias uniform in [-k, k],
      k = sqrt(groups / (in_channels * kh * kw)).
    * nn.Linear: weight and bias uniform in [-k, k],
      k = sqrt(1 / weight.shape[0]).
    * nn.Embedding: weight normal with mean 0 and std 1.
    * nn.BatchNorm2D / nn.LayerNorm: weight filled with 1, bias with 0.

    Args:
        model (paddle.Layer): layer whose sublayers are visited.
        include_self (bool): forwarded to ``named_sublayers``; whether the
            model itself is visited as well. True by default.
    Return:
        None
    """

    def _has_bias(layer):
        return getattr(layer, 'bias', None) is not None

    for _, m in model.named_sublayers(include_self=include_self):
        if isinstance(m, nn.Conv2D):
            fan = m._in_channels * m._kernel_size[0] * m._kernel_size[1]
            k = math.sqrt(float(m._groups) / fan)
            _no_grad_uniform_(m.weight, -k, k)
            if _has_bias(m):
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Linear):
            k = math.sqrt(1. / m.weight.shape[0])
            _no_grad_uniform_(m.weight, -k, k)
            if _has_bias(m):
                _no_grad_uniform_(m.bias, -k, k)
        elif isinstance(m, nn.Embedding):
            _no_grad_normal_(m.weight, mean=0., std=1.)
        elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)):
            _no_grad_fill_(m.weight, 1.)
            if _has_bias(m):
                _no_grad_fill_(m.bias, 0)

View File

@@ -0,0 +1,403 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
this code is based on https://github.com/open-mmlab/mmpose
"""
import cv2
import numpy as np
import paddle.nn.functional as F
def get_affine_mat_kernel(h, w, s, inv=False):
    """Build the affine matrix that resizes an image so its short side is ``s``.

    The long side is scaled by the same ratio and rounded up to a multiple
    of 64.

    Args:
        h (int|float): image height.
        w (int|float): image width.
        s (int): target length of the shorter side.
        inv (bool): forwarded to ``get_affine_transform``.

    Returns:
        tuple: (affine matrix, (resized_w, resized_h)).
    """
    if w < h:
        resized_w = s
        resized_h = int(np.ceil((s / w * h) / 64.) * 64)
        scale_w = w
        scale_h = resized_h / resized_w * w
    else:
        resized_h = s
        resized_w = int(np.ceil((s / h * w) / 64.) * 64)
        scale_h = h
        scale_w = resized_w / resized_h * h

    center = np.array([np.round(w / 2.), np.round(h / 2.)])
    size_resized = (resized_w, resized_h)
    scale = np.array([scale_w, scale_h])

    trans = get_affine_transform(center, scale, 0, size_resized, inv=inv)
    return trans, size_resized
def get_affine_transform(center,
                         input_size,
                         rot,
                         output_size,
                         shift=(0., 0.),
                         inv=False):
    """Compute an affine matrix from center, size, rotation and output size.

    Three point pairs are built (center, a point half the source width
    above it rotated by ``rot``, and a third point completing a right angle)
    and handed to ``cv2.getAffineTransform``.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ]): Size of input feature (width, height).
            A value that is neither an ndarray nor a list is used for both
            width and height.
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: The transform matrix.
    """
    assert len(center) == 2
    assert len(output_size) == 2
    assert len(shift) == 2

    if not isinstance(input_size, (np.ndarray, list)):
        input_size = np.array([input_size, input_size], dtype=np.float32)

    shift = np.array(shift)
    offset = input_size * shift
    src_w = input_size[0]
    dst_w, dst_h = output_size[0], output_size[1]

    rot_rad = np.pi * rot / 180
    src_dir = rotate_point([0., src_w * -0.5], rot_rad)
    dst_dir = np.array([0., dst_w * -0.5])

    # Source triangle
    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + offset
    src[1, :] = center + src_dir + offset
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    # Destination triangle
    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
    dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    from_pts, to_pts = (dst, src) if inv else (src, dst)
    return cv2.getAffineTransform(np.float32(from_pts), np.float32(to_pts))
def get_warp_matrix(theta, size_input, size_dst, size_target):
    """Compute the unbiased 2x3 warp matrix.

    Based on
    https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py

    Paper ref: Huang et al. The Devil is in the Details: Delving into
    Unbiased Data Processing for Human Pose Estimation (CVPR 2020).

    Args:
        theta (float): Rotation angle in degrees.
        size_input (np.ndarray): Size of input image [w, h].
        size_dst (np.ndarray): Size of output image [w, h].
        size_target (np.ndarray): Size of ROI in input plane [w, h].

    Returns:
        matrix (np.ndarray): float32 matrix of shape (2, 3).
    """
    theta = np.deg2rad(theta)
    cos_t, sin_t = np.cos(theta), np.sin(theta)
    scale_x = size_dst[0] / size_target[0]
    scale_y = size_dst[1] / size_target[1]

    shift_x = (-0.5 * size_input[0] * cos_t + 0.5 * size_input[1] * sin_t +
               0.5 * size_target[0])
    shift_y = (-0.5 * size_input[0] * sin_t - 0.5 * size_input[1] * cos_t +
               0.5 * size_target[1])

    matrix = np.zeros((2, 3), dtype=np.float32)
    # first row: x' coefficients
    matrix[0, 0] = cos_t * scale_x
    matrix[0, 1] = -sin_t * scale_x
    matrix[0, 2] = scale_x * shift_x
    # second row: y' coefficients
    matrix[1, 0] = sin_t * scale_y
    matrix[1, 1] = cos_t * scale_y
    matrix[1, 2] = scale_y * shift_y
    return matrix
def _get_3rd_point(a, b):
"""To calculate the affine matrix, three pairs of points are required. This
function is used to get the 3rd point, given 2D points a & b.
The 3rd point is defined by rotating vector `a - b` by 90 degrees
anticlockwise, using b as the rotation center.
Args:
a (np.ndarray): point(x,y)
b (np.ndarray): point(x,y)
Returns:
np.ndarray: The 3rd point.
"""
assert len(
a) == 2, 'input of _get_3rd_point should be point with length of 2'
assert len(
b) == 2, 'input of _get_3rd_point should be point with length of 2'
direction = a - b
third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
return third_pt
def rotate_point(pt, angle_rad):
    """Rotate a 2D point around the origin.

    Args:
        pt (list[float]): 2 dimensional point to be rotated
        angle_rad (float): rotation angle by radian

    Returns:
        list[float]: Rotated point.
    """
    assert len(pt) == 2
    x, y = pt[0], pt[1]
    sin_a, cos_a = np.sin(angle_rad), np.cos(angle_rad)
    return [x * cos_a - y * sin_a, x * sin_a + y * cos_a]
def transpred(kpts, h, w, s):
    """Map keypoint coordinates back through the inverse resize transform.

    The inverse matrix of ``get_affine_mat_kernel(h, w, s)`` is applied to
    a copy of the first two entries of the last axis of ``kpts``.
    """
    inverse_trans, _ = get_affine_mat_kernel(h, w, s, inv=True)
    coords = kpts[..., :2].copy()
    return warp_affine_joints(coords, inverse_trans)
def warp_affine_joints(joints, mat):
    """Apply an affine transform to a set of joints.

    Args:
        joints (np.ndarray[..., 2]): Origin coordinate of joints.
        mat (np.ndarray[2, 3]): The affine matrix; its transpose is
            multiplied with the homogeneous joint coordinates.

    Returns:
        matrix (np.ndarray[..., 2]): Result coordinate of joints, in the
        shape of the input.
    """
    joints = np.array(joints)
    original_shape = joints.shape
    flat = joints.reshape(-1, 2)
    # Append a column of ones to obtain homogeneous coordinates.
    ones_column = flat[:, 0:1] * 0 + 1
    homogeneous = np.concatenate((flat, ones_column), axis=1)
    warped = np.dot(homogeneous, mat.T)
    return warped.reshape(original_shape)
def affine_transform(pt, t):
    """Apply the affine matrix ``t`` to a single 2D point.

    The point is extended to homogeneous coordinates (x, y, 1), multiplied
    by ``t`` and the first two entries of the product are returned.
    """
    homogeneous = np.array([pt[0], pt[1], 1.]).T
    return np.dot(t, homogeneous)[:2]
def transform_preds(coords, center, scale, output_size):
    """Map predicted coordinates back with the inverse affine transform.

    The inverse transform is built from ``center``, ``scale * 200`` and
    ``output_size`` and applied to the first two columns of every row of
    ``coords``. Remaining columns of the result stay zero.

    Returns:
        np.ndarray: array with the shape of ``coords``.
    """
    target_coords = np.zeros(coords.shape)
    trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1)
    num_points = coords.shape[0]
    for idx in range(num_points):
        target_coords[idx, 0:2] = affine_transform(coords[idx, 0:2], trans)
    return target_coords
def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
    """Compute the object keypoint similarity of one pose against many.

    Args:
        g (np.ndarray): Reference pose, flattened as
            (x0, y0, v0, x1, y1, v1, ...).
        d (np.ndarray): Candidate poses, one flattened pose per row.
        a_g (float): Area of the reference instance.
        a_d (np.ndarray): Areas of the candidate instances, one per row of
            ``d``.
        sigmas (np.ndarray, optional): Per-keypoint constants. Anything that
            is not an ndarray selects the built-in 17-value table.
        in_vis_thre (float, optional): When given, only keypoints whose
            third value exceeds the threshold in BOTH the reference and the
            candidate contribute to the similarity.

    Returns:
        np.ndarray: One similarity per row of ``d``; 0.0 for a candidate
        without any contributing keypoint.
    """
    if not isinstance(sigmas, np.ndarray):
        sigmas = np.array([
            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
            .87, .87, .89, .89
        ]) / 10.0
    variances = (sigmas * 2)**2
    xg = g[0::3]
    yg = g[1::3]
    vg = g[2::3]
    ious = np.zeros((d.shape[0]))
    for n_d in range(0, d.shape[0]):
        xd = d[n_d, 0::3]
        yd = d[n_d, 1::3]
        vd = d[n_d, 2::3]
        dx = xd - xg
        dy = yd - yg
        # Squared distance normalised by keypoint variance and mean area.
        e = (dx**2 + dy**2) / variances / (
            (a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
        if in_vis_thre is not None:
            # Element-wise AND. The previous ``list(a) and list(b)``
            # evaluated to the second list only and ignored ``vg``.
            visible = np.logical_and(vg > in_vis_thre, vd > in_vis_thre)
            e = e[visible]
        ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
    return ious
def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
    """Greedy keypoint NMS based on OKS.

    Poses are visited by descending score; a pose is kept and every
    remaining pose whose OKS with it exceeds ``thresh`` is discarded.

    Args:
        kpts_db (list): The predicted keypoints within the image; every
            entry provides 'score', 'keypoints' and 'area'.
        thresh (float): The threshold to select the boxes
        sigmas (np.array): The variance to calculate the oks iou
            Default: None
        in_vis_thre (float): The threshold to select the high confidence boxes
            Default: None

    Return:
        keep (list): indexes to keep
    """
    num_poses = len(kpts_db)
    if num_poses == 0:
        return []

    entries = [kpts_db[idx] for idx in range(num_poses)]
    scores = np.array([entry['score'] for entry in entries])
    kpts = np.array([entry['keypoints'].flatten() for entry in entries])
    areas = np.array([entry['area'] for entry in entries])

    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(best)

        oks_ovr = oks_iou(kpts[best], kpts[rest], areas[best], areas[rest],
                          sigmas, in_vis_thre)

        # Positions in ``rest`` are shifted by one relative to ``order``.
        survivors = np.where(oks_ovr <= thresh)[0]
        order = order[survivors + 1]

    return keep
def rescore(overlap, scores, thresh, type='gaussian'):
    """Decay scores according to their overlap with a selected pose.

    With ``type='linear'`` the scores whose overlap is at least ``thresh``
    are multiplied by ``1 - overlap`` in place. Any other ``type`` returns a
    new array ``scores * exp(-overlap**2 / thresh)``.
    """
    assert overlap.shape[0] == scores.shape[0]
    if type != 'linear':
        return scores * np.exp(-overlap**2 / thresh)

    hit = np.where(overlap >= thresh)[0]
    scores[hit] = scores[hit] * (1 - overlap[hit])
    return scores
def soft_oks_nms(kpts_db,
                 thresh,
                 sigmas=None,
                 in_vis_thre=None,
                 max_dets=20):
    """Soft keypoint NMS based on OKS.

    Instead of discarding overlapping poses, the scores of the remaining
    poses are decayed by ``rescore`` after every selection and the poses
    are re-sorted. Selection stops after ``max_dets`` poses.

    Args:
        kpts_db (list): The predicted keypoints within the image; every
            entry provides 'score', 'keypoints' and 'area'.
        thresh (float): Passed to ``rescore`` as its threshold.
        sigmas (np.array): The variance to calculate the oks iou
            Default: None
        in_vis_thre (float): The threshold to select the high confidence boxes
            Default: None
        max_dets (int): Upper bound on the number of returned indexes.
            Default: 20 (the value that used to be hard-coded).

    Return:
        keep (np.ndarray): indexes to keep, in selection order. An empty
        list is returned for an empty ``kpts_db``.
    """
    if len(kpts_db) == 0:
        return []

    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
    kpts = np.array(
        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])

    order = scores.argsort()[::-1]
    scores = scores[order]

    keep = np.zeros(max_dets, dtype=np.intp)
    keep_cnt = 0
    while order.size > 0 and keep_cnt < max_dets:
        i = order[0]

        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
                          sigmas, in_vis_thre)

        # Drop the selected pose, decay the rest and re-sort by new score.
        order = order[1:]
        scores = rescore(oks_ovr, scores[1:], thresh)

        tmp = scores.argsort()[::-1]
        order = order[tmp]
        scores = scores[tmp]

        keep[keep_cnt] = i
        keep_cnt += 1

    keep = keep[:keep_cnt]

    return keep
def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    """Resize a feature map with ``F.interpolate``, warning on odd sizes.

    Args:
        input (Tensor): Feature map; its axes from index 2 on are read as
            (height, width).
        size (tuple, optional): Target (height, width).
        scale_factor (optional): Forwarded to ``F.interpolate``.
        mode (str): Interpolation mode, 'nearest' by default.
        align_corners (bool, optional): Forwarded to ``F.interpolate``.
        warning (bool): Emit a warning when upsampling with
            ``align_corners`` and the sizes are not of the form ``x+1`` /
            ``nx+1``.

    Returns:
        The result of ``F.interpolate``.
    """
    # Imported locally: this name is used below but is not among the
    # imports at the top of this file.
    import warnings

    if warning and size is not None and align_corners:
        input_h, input_w = tuple(int(x) for x in input.shape[2:])
        output_h, output_w = tuple(int(x) for x in size)
        # Upsampling in at least one direction. The width used to be
        # compared against ``output_h`` by mistake.
        if output_h > input_h or output_w > input_w:
            if ((output_h > 1 and output_w > 1 and input_h > 1 and
                 input_w > 1) and (output_h - 1) % (input_h - 1) and
                    (output_w - 1) % (input_w - 1)):
                warnings.warn(
                    f'When align_corners={align_corners}, '
                    'the output would more aligned if '
                    f'input size {(input_h, input_w)} is `x+1` and '
                    f'out size {(output_h, output_w)} is `nx+1`')

    return F.interpolate(input, size, scale_factor, mode, align_corners)
def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
    """Undo a horizontal flip on heatmaps predicted from flipped images.

    Note:
        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        output_flipped ([N, K, H, W]): The output heatmaps obtained from
            the flipped images. The object must provide ``reshape`` and
            ``clone``. For 'CombinedTarget' every third channel starting
            at index 1 is negated in place on this input.
        flip_pairs (list[tuple()): Pairs of keypoints which are mirrored
            (for example, left ear -- right ear).
        target_type (str): GaussianHeatmap or CombinedTarget

    Returns:
        heatmaps that flipped back to the original image
    """
    assert len(output_flipped.shape) == 4, \
        'output_flipped should be [batch_size, num_keypoints, height, width]'
    shape_ori = output_flipped.shape

    is_combined = target_type.lower() == 'CombinedTarget'.lower()
    channels = 3 if is_combined else 1
    if is_combined:
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]

    # Group the channels of every keypoint so pairs can be swapped at once.
    grouped_shape = (shape_ori[0], -1, channels, shape_ori[2], shape_ori[3])
    output_flipped = output_flipped.reshape(grouped_shape)
    output_flipped_back = output_flipped.clone()

    # Swap left-right parts
    for left, right in flip_pairs:
        output_flipped_back[:, left, ...] = output_flipped[:, right, ...]
        output_flipped_back[:, right, ...] = output_flipped[:, left, ...]

    # Restore the layout, then flip horizontally
    restored = output_flipped_back.reshape(shape_ori)
    return restored[..., ::-1]

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,19 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .iou_loss import *
from .gfocal_loss import *
from .detr_loss import *
from .focal_loss import *
from .smooth_l1_loss import *

View File

@@ -0,0 +1,578 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .iou_loss import GIoULoss
from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
from ..bbox_utils import bbox_iou
__all__ = ['DETRLoss', 'DINOLoss']
@register
class DETRLoss(nn.Layer):
    """Set-prediction loss for DETR-style detectors.

    Predictions are assigned to ground truths by the injected ``matcher``;
    classification, box (L1 + GIoU) and, when masks are supplied, mask
    (focal + dice) losses are then computed on the matched pairs.
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 use_vfl=False,
                 use_uni_match=False,
                 uni_match_ind=0):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss. The dict is copied, the
                caller's object (and the signature default) is never modified.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
            use_vfl (bool): With focal loss enabled, use the varifocal loss with
                IoU-based target scores instead of the sigmoid focal loss.
            use_uni_match (bool): Match once, on decoder layer ``uni_match_ind``,
                and reuse that assignment for every auxiliary layer.
            uni_match_ind (int): Index of the layer used when ``use_uni_match`` is set.
        """
        super(DETRLoss, self).__init__()
        self.num_classes = num_classes
        self.matcher = matcher
        # Shallow copy: the 'class' entry may be replaced by a tensor below, and
        # writing that into the shared default dict would leak into every
        # instance created afterwards.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss
        self.use_vfl = use_vfl
        self.use_uni_match = use_uni_match
        self.uni_match_ind = uni_match_ind

        if not self.use_focal_loss:
            # Per-class cross-entropy weights; the last slot is the
            # "no object" (background) class.
            self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                                   loss_coeff['class'])
            self.loss_coeff['class'][-1] = loss_coeff['no_object']
        self.giou_loss = GIoULoss()

    def _get_loss_class(self,
                        logits,
                        gt_class,
                        match_indices,
                        bg_index,
                        num_gts,
                        postfix="",
                        iou_score=None):
        """Classification loss; returns ``{"loss_class" + postfix: loss}``."""
        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
        name_class = "loss_class" + postfix

        # Every query starts as background, matched queries are overwritten.
        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        num_gt = sum(len(a) for a in gt_class)
        if num_gt > 0:
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])

        if self.use_focal_loss:
            # One-hot over num_classes + 1, then drop the background column.
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[..., :-1]
            if iou_score is not None and self.use_vfl:
                target_score = paddle.zeros([bs, num_query_objects])
                if num_gt > 0:
                    target_score = paddle.scatter(
                        target_score.reshape([-1, 1]), index, iou_score)
                target_score = target_score.reshape(
                    [bs, num_query_objects, 1]) * target_label
                loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits(
                    logits, target_score, target_label,
                    num_gts / num_query_objects)
            else:
                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
                    logits, target_label, num_gts / num_query_objects)
        else:
            loss_ = F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])

        return {name_class: loss_}

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
                       postfix=""):
        """L1 and GIoU box losses on matched pairs, normalized by ``num_gts``."""
        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
        name_bbox = "loss_bbox" + postfix
        name_giou = "loss_giou" + postfix

        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            loss[name_bbox] = paddle.to_tensor([0.])
            loss[name_giou] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss[name_giou] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss[name_giou] = loss[name_giou].sum() / num_gts
        loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """Sigmoid focal and dice mask losses on matched pairs."""
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Resize the predictions to the ground-truth mask resolution.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks,
            target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        """Dice loss on sigmoid(inputs), summed and divided by ``num_gts``."""
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        # +1 on both sides keeps the ratio defined for empty masks.
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_iou_score(self, boxes, gt_bbox, match_indices):
        """IoU of matched prediction/GT pairs for VFL targets, or None.

        Returns None when ``use_vfl`` is off or when there is no ground truth.
        The predicted boxes are detached, so the score acts as a target only.
        """
        if not self.use_vfl or sum(len(a) for a in gt_bbox) == 0:
            return None
        src_bbox, target_bbox = self._get_src_target_assign(
            boxes.detach(), gt_bbox, match_indices)
        return bbox_iou(
            bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
            bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))

    def _get_loss_aux(self,
                      boxes,
                      logits,
                      gt_bbox,
                      gt_class,
                      bg_index,
                      num_gts,
                      dn_match_indices=None,
                      postfix="",
                      masks=None,
                      gt_mask=None):
        """Sum the per-layer losses over the auxiliary decoder outputs.

        The assignment comes from ``dn_match_indices`` when given, else from a
        single match on layer ``uni_match_ind`` when ``use_uni_match`` is set,
        else from a fresh match for every layer.
        """
        loss_class = []
        loss_bbox, loss_giou = [], []
        loss_mask, loss_dice = [], []
        if dn_match_indices is not None:
            match_indices = dn_match_indices
        elif self.use_uni_match:
            match_indices = self.matcher(
                boxes[self.uni_match_ind],
                logits[self.uni_match_ind],
                gt_bbox,
                gt_class,
                masks=masks[self.uni_match_ind] if masks is not None else None,
                gt_mask=gt_mask)

        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
            aux_masks = masks[i] if masks is not None else None
            if not self.use_uni_match and dn_match_indices is None:
                match_indices = self.matcher(
                    aux_boxes,
                    aux_logits,
                    gt_bbox,
                    gt_class,
                    masks=aux_masks,
                    gt_mask=gt_mask)
            iou_score = self._get_iou_score(aux_boxes, gt_bbox, match_indices)

            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts, postfix, iou_score)[
                                         'loss_class' + postfix])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts, postfix)
            loss_bbox.append(loss_['loss_bbox' + postfix])
            loss_giou.append(loss_['loss_giou' + postfix])
            if masks is not None and gt_mask is not None:
                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
                                            num_gts, postfix)
                loss_mask.append(loss_['loss_mask' + postfix])
                loss_dice.append(loss_['loss_dice' + postfix])

        loss = {
            "loss_class_aux" + postfix: paddle.add_n(loss_class),
            "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
            "loss_giou_aux" + postfix: paddle.add_n(loss_giou)
        }
        if masks is not None and gt_mask is not None:
            loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
            loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        """Return flat query indices (over batch * query) and matched targets."""
        batch_idx = paddle.concat([
            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        # Offset per-image query indices into the flattened batch.
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        """Gather matched predictions and targets, concatenated over the batch.

        Images without matches contribute an empty ``[0, last_dim]`` tensor.
        """
        src_assign = paddle.concat([
            paddle.gather(
                t, src_idx, axis=0)
            if len(src_idx) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (src_idx, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, dst_idx, axis=0)
            if len(dst_idx) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, dst_idx) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def _get_num_gts(self, targets, dtype="float32"):
        """Number of ground truths as a 1-element tensor, clipped to >= 1.

        When running distributed, the count is all-reduced and divided by the
        world size.
        """
        num_gts = sum(len(a) for a in targets)
        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(num_gts)
            num_gts /= paddle.distributed.get_world_size()
        # Clip so that later divisions never see zero.
        num_gts = paddle.clip(num_gts, min=1.)
        return num_gts

    def _get_prediction_loss(self,
                             boxes,
                             logits,
                             gt_bbox,
                             gt_class,
                             masks=None,
                             gt_mask=None,
                             postfix="",
                             dn_match_indices=None,
                             num_gts=1):
        """Losses for one decoder layer (class, bbox/giou and optional masks)."""
        if dn_match_indices is None:
            match_indices = self.matcher(
                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
        else:
            match_indices = dn_match_indices

        iou_score = self._get_iou_score(boxes, gt_bbox, match_indices)

        loss = dict()
        loss.update(
            self._get_loss_class(logits, gt_class, match_indices,
                                 self.num_classes, num_gts, postfix, iou_score))
        loss.update(
            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
                                postfix))
        if masks is not None and gt_mask is not None:
            loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
                                    postfix))
        return loss

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                **kwargs):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [l, b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
            postfix (str): postfix of loss name
            **kwargs: optional ``dn_match_indices`` (precomputed assignment
                used instead of the matcher) and ``num_gts`` (normalizer;
                computed from ``gt_class`` when absent).
        Returns:
            dict: loss name -> loss tensor. The last layer yields the plain
                names, the remaining layers the ``*_aux`` names when
                ``aux_loss`` is enabled.
        """
        dn_match_indices = kwargs.get("dn_match_indices", None)
        num_gts = kwargs.get("num_gts", None)
        if num_gts is None:
            num_gts = self._get_num_gts(gt_class)

        total_loss = self._get_prediction_loss(
            boxes[-1],
            logits[-1],
            gt_bbox,
            gt_class,
            masks=masks[-1] if masks is not None else None,
            gt_mask=gt_mask,
            postfix=postfix,
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(
                    boxes[:-1],
                    logits[:-1],
                    gt_bbox,
                    gt_class,
                    self.num_classes,
                    num_gts,
                    dn_match_indices,
                    postfix,
                    masks=masks[:-1] if masks is not None else None,
                    gt_mask=gt_mask))

        return total_loss
@register
class DINOLoss(DETRLoss):
    """DETR loss extended with the denoising (``_dn``) branch used by DINO."""

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_meta=None,
                **kwargs):
        """Return the matching losses plus their ``*_dn`` denoising counterparts.

        When ``dn_meta`` is None, each ``*_dn`` entry is a zero tensor so the
        returned dict always has the same keys.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(DINOLoss, self).forward(
            boxes, logits, gt_bbox, gt_class, num_gts=num_gts)

        if dn_meta is None:
            # No denoising queries: mirror every key with a zero-valued "_dn" loss.
            zero_dn = {
                key + '_dn': paddle.to_tensor([0.])
                for key in total_loss.keys()
            }
            total_loss.update(zero_dn)
            return total_loss

        dn_positive_idx = dn_meta["dn_positive_idx"]
        dn_num_group = dn_meta["dn_num_group"]
        assert len(gt_class) == len(dn_positive_idx)

        # The denoising assignment is fixed by construction, no matcher needed.
        dn_match_indices = self.get_dn_match_indices(
            gt_class, dn_positive_idx, dn_num_group)

        # Each ground truth is repeated once per denoising group.
        num_gts *= dn_num_group
        dn_loss = super(DINOLoss, self).forward(
            dn_out_bboxes,
            dn_out_logits,
            gt_bbox,
            gt_class,
            postfix="_dn",
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)
        total_loss.update(dn_loss)
        return total_loss

    @staticmethod
    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
        """Build one ``(query_idx, gt_idx)`` pair per image for denoising queries.

        ``gt_idx`` is ``0..num_gt-1`` tiled ``dn_num_group`` times; images
        without ground truth get a pair of empty int64 tensors.
        """
        pairs = []
        for img_id, img_labels in enumerate(labels):
            num_gt = len(img_labels)
            if num_gt <= 0:
                empty_src = paddle.zeros([0], dtype="int64")
                empty_dst = paddle.zeros([0], dtype="int64")
                pairs.append((empty_src, empty_dst))
                continue
            gt_idx = paddle.arange(end=num_gt, dtype="int64")
            gt_idx = gt_idx.tile([dn_num_group])
            assert len(dn_positive_idx[img_id]) == len(gt_idx)
            pairs.append((dn_positive_idx[img_id], gt_idx))
        return pairs
@register
class MaskDINOLoss(DETRLoss):
    """DETR loss with a denoising branch and point-sampled mask losses.

    Mask BCE and dice losses are evaluated on points sampled from the
    predicted masks, biased towards uncertain locations (logits near zero),
    rather than on the full-resolution masks.
    """
    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 4,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 5,
                     'dice': 5
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 num_sample_points=12544,
                 oversample_ratio=3.0,
                 important_sample_ratio=0.75):
        """
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): Assignment between targets and predictions.
            loss_coeff (dict): The coefficient of each loss term. A copy is
                handed to the base class so the caller's dict (and the
                signature default) is never modified.
            aux_loss (bool): Whether losses of the auxiliary decoder layers are used.
            use_focal_loss (bool): Use focal loss or not.
            num_sample_points (int): Number of points sampled per mask.
            oversample_ratio (float): Candidate points drawn per kept point, >= 1.
            important_sample_ratio (float): Fraction of points chosen by
                uncertainty, in [0, 1]; the rest are uniformly random.
        """
        # The base constructor may write into the dict it receives; pass a copy
        # so the mutable default above is never altered.
        super(MaskDINOLoss, self).__init__(num_classes, matcher,
                                           dict(loss_coeff), aux_loss,
                                           use_focal_loss)
        assert oversample_ratio >= 1
        assert important_sample_ratio <= 1 and important_sample_ratio >= 0

        self.num_sample_points = num_sample_points
        self.oversample_ratio = oversample_ratio
        self.important_sample_ratio = important_sample_ratio
        self.num_oversample_points = int(num_sample_points * oversample_ratio)
        self.num_important_points = int(num_sample_points *
                                        important_sample_ratio)
        self.num_random_points = num_sample_points - self.num_important_points

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_out_masks=None,
                dn_meta=None,
                **kwargs):
        """Return the matching losses plus their ``*_dn`` denoising counterparts.

        When ``dn_meta`` is None, each ``*_dn`` entry is a zero tensor.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(MaskDINOLoss, self).forward(
            boxes,
            logits,
            gt_bbox,
            gt_class,
            masks=masks,
            gt_mask=gt_mask,
            num_gts=num_gts)

        if dn_meta is not None:
            dn_positive_idx, dn_num_group = \
                dn_meta["dn_positive_idx"], dn_meta["dn_num_group"]
            assert len(gt_class) == len(dn_positive_idx)

            # denoising match indices
            dn_match_indices = DINOLoss.get_dn_match_indices(
                gt_class, dn_positive_idx, dn_num_group)

            # compute denoising training loss
            num_gts *= dn_num_group
            dn_loss = super(MaskDINOLoss, self).forward(
                dn_out_bboxes,
                dn_out_logits,
                gt_bbox,
                gt_class,
                masks=dn_out_masks,
                gt_mask=gt_mask,
                postfix="_dn",
                dn_match_indices=dn_match_indices,
                num_gts=num_gts)
            total_loss.update(dn_loss)
        else:
            total_loss.update(
                {k + '_dn': paddle.to_tensor([0.])
                 for k in total_loss.keys()})

        return total_loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """BCE and dice mask losses evaluated on sampled points."""
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # sample points; grid_sample expects coordinates in [-1, 1]
        sample_points = self._get_point_coords_by_uncertainty(src_masks)
        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0

        src_masks = F.grid_sample(
            src_masks.unsqueeze(1), sample_points,
            align_corners=False).squeeze([1, 2])

        target_masks = F.grid_sample(
            target_masks.unsqueeze(1), sample_points,
            align_corners=False).squeeze([1, 2]).detach()

        loss[name_mask] = self.loss_coeff[
            'mask'] * F.binary_cross_entropy_with_logits(
                src_masks, target_masks,
                reduction='none').mean(1).sum() / num_gts
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _get_point_coords_by_uncertainty(self, masks):
        """Sample ``num_sample_points`` coordinates in [0, 1) per mask.

        ``num_oversample_points`` random candidates are drawn; the
        ``num_important_points`` with the smallest absolute mask logit are kept
        and topped up with ``num_random_points`` fresh random points.
        """
        # Sample points based on their uncertainty.
        masks = masks.detach()
        num_masks = masks.shape[0]
        sample_points = paddle.rand(
            [num_masks, 1, self.num_oversample_points, 2])

        out_mask = F.grid_sample(
            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
            align_corners=False).squeeze([1, 2])
        # Negated magnitude: top-k then selects logits closest to zero.
        out_mask = -paddle.abs(out_mask)

        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
        if self.num_random_points > 0:
            sample_points = paddle.concat(
                [
                    sample_points,
                    paddle.rand([num_masks, self.num_random_points, 2])
                ],
                axis=1)
        return sample_points

View File

@@ -0,0 +1,138 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn.functional as F
import paddle.nn as nn
from ppdet.core.workspace import register
__all__ = ['FocalLoss', 'Weighted_FocalLoss']
@register
class FocalLoss(nn.Layer):
    """Thin layer around ``paddle.nn.functional.sigmoid_focal_loss``.

    Args:
        use_sigmoid (bool): must be True, only the sigmoid variant is supported
        alpha (float): alpha parameter of the focal loss
        gamma (float): gamma parameter of the focal loss
        loss_weight (float): factor applied to the resulting loss
    """

    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0):
        super(FocalLoss, self).__init__()
        assert use_sigmoid == True, \
            'Focal Loss only supports sigmoid at the moment'
        self.loss_weight = loss_weight
        self.gamma = gamma
        self.alpha = alpha
        self.use_sigmoid = use_sigmoid

    def forward(self, pred, target, reduction='none'):
        """Compute the weighted sigmoid focal loss.

        Args:
            pred (Tensor): class logits, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            reduction (str): how to reduce the loss, one of (none, sum, mean)
        """
        class_count = pred.shape[1]
        # One extra one-hot column for the background label, dropped right after.
        one_hot = F.one_hot(target, class_count + 1).cast(pred.dtype)
        one_hot = one_hot[:, :-1].detach()
        focal = F.sigmoid_focal_loss(
            pred,
            one_hot,
            alpha=self.alpha,
            gamma=self.gamma,
            reduction=reduction)
        return focal * self.loss_weight
@register
class Weighted_FocalLoss(FocalLoss):
    """Sigmoid focal loss with optional element weights and average factor.

    Args:
        use_sigmoid (bool): currently only support use_sigmoid=True
        alpha (float): parameter alpha in Focal Loss
        gamma (float): parameter gamma in Focal Loss
        loss_weight (float): final loss will be multiplied by this
        reduction (str): default reduction, one of (none, sum, mean)
    """

    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0,
                 reduction="mean"):
        # Delegate to FocalLoss.__init__ rather than skipping it: it performs
        # the use_sigmoid check and sets alpha / gamma / loss_weight.
        super(Weighted_FocalLoss, self).__init__(
            use_sigmoid=use_sigmoid,
            alpha=alpha,
            gamma=gamma,
            loss_weight=loss_weight)
        self.reduction = reduction

    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
        """forward function.

        Args:
            pred (Tensor): logits of class prediction, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            weight (Tensor, optional): weight multiplied into the unreduced
                loss; reshaped to a column when only its first dimension
                matches, otherwise to ``(N, -1)``.
            avg_factor (float, optional): with 'mean' reduction, the loss sum
                is divided by this value instead of the element count.
            reduction_override (str, optional): one of (none, sum, mean);
                replaces the reduction given at construction when set.
        Raises:
            ValueError: if ``avg_factor`` is given together with 'sum' reduction.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        num_classes = pred.shape[1]
        # One extra one-hot column for the background label, dropped right after.
        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
        target = target[:, :-1].detach()
        loss = F.sigmoid_focal_loss(
            pred, target, alpha=self.alpha, gamma=self.gamma,
            reduction='none')

        if weight is not None:
            if weight.shape != loss.shape:
                if weight.shape[0] == loss.shape[0]:
                    # For most cases, weight is of shape (num_priors, ),
                    # which means it does not have the second axis num_class
                    weight = weight.reshape((-1, 1))
                else:
                    # Sometimes, weight per anchor per class is also needed. e.g.
                    # in FSAF. But it may be flattened of shape
                    # (num_priors x num_class, ), while loss is still of shape
                    # (num_priors, num_class).
                    assert weight.numel() == loss.numel()
                    weight = weight.reshape((loss.shape[0], -1))
            assert weight.ndim == loss.ndim
            loss = loss * weight

        # if avg_factor is not specified, just reduce the loss
        if avg_factor is None:
            if reduction == 'mean':
                loss = loss.mean()
            elif reduction == 'sum':
                loss = loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if reduction == 'mean':
                # Avoid causing ZeroDivisionError when avg_factor is 0.0,
                # i.e., all labels of an image belong to ignore index.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif reduction != 'none':
                raise ValueError('avg_factor can not be used with reduction="sum"')

        return loss * self.loss_weight

View File

@@ -0,0 +1,217 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling import ops
__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
    """
    Quality Focal Loss (QFL), from `Generalized Focal Loss: Learning
    Qualified and Distributed Bounding Boxes for Dense Object Detection
    <https://arxiv.org/abs/2006.04388>`_.

    Args:
        pred (Tensor): Joint classification / quality (IoU) prediction with
            shape (N, C), where C is the number of classes.
        target (tuple([Tensor])): Category label with shape (N,) and quality
            label with shape (N,).
        beta (float): Exponent of the modulating factor. Defaults to 2.0.
        use_sigmoid (bool): If True, ``pred`` holds logits and the sigmoid is
            applied here; otherwise ``pred`` is used as probabilities.

    Returns:
        Tensor: Loss tensor with shape (N,).
    """
    assert len(target) == 2, """target for QFL must be a tuple of two elements,
        including category label and quality label, respectively"""
    # first element: category id, second element: quality score
    label, score = target

    bce = (F.binary_cross_entropy_with_logits
           if use_sigmoid else F.binary_cross_entropy)
    prob = F.sigmoid(pred) if use_sigmoid else pred

    # Every entry is first treated as a negative, supervised by quality 0.
    zeros = paddle.zeros(pred.shape, dtype='float32')
    loss = bce(pred, zeros, reduction='none') * prob.pow(beta)

    # FG cat_id: [0, num_classes -1], BG cat_id: num_classes
    bg_class_ind = pred.shape[1]
    is_fg = paddle.logical_and((label >= 0), (label < bg_class_ind))
    pos = is_fg.nonzero().squeeze(1)
    if pos.shape[0] == 0:
        return loss.sum(axis=1)

    pos_label = paddle.gather(label, pos, axis=0)
    # Boolean mask marking the (sample, class) entry of each positive.
    mask_np = np.zeros(pred.shape, dtype=np.int32)
    mask_np[pos.numpy(), pos_label.numpy()] = 1
    pos_mask = paddle.to_tensor(mask_np, dtype='bool')

    score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')
    # Positives are supervised by the bbox quality (IoU) score.
    quality_gap = score - prob
    loss_pos = bce(pred, score,
                   reduction='none') * quality_gap.abs().pow(beta)

    loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
    return loss.sum(axis=1)
def distribution_focal_loss(pred, label):
    """Distribution Focal Loss (DFL), from `Generalized Focal Loss: Learning
    Qualified and Distributed Bounding Boxes for Dense Object Detection
    <https://arxiv.org/abs/2006.04388>`_.

    Args:
        pred (Tensor): Predicted general distribution of bounding boxes
            (before softmax) with shape (N, n+1), n is the max value of the
            integral set `{0, ..., n}` in paper.
        label (Tensor): Target distance label for bounding boxes with
            shape (N,).

    Returns:
        Tensor: Loss tensor with shape (N,).
    """
    # The two integer bins surrounding the continuous target.
    left_bin = label.cast('int64')
    right_bin = left_bin + 1
    # Linear interpolation weights: the closer bin gets the larger weight.
    left_w = right_bin.cast('float32') - label
    right_w = label - left_bin.cast('float32')
    left_term = F.cross_entropy(pred, left_bin, reduction='none') * left_w
    right_term = F.cross_entropy(pred, right_bin, reduction='none') * right_w
    return left_term + right_term
@register
@serializable
class QualityFocalLoss(nn.Layer):
    r"""Layer form of Quality Focal Loss (QFL), from `Generalized Focal Loss:
    Learning Qualified and Distributed Bounding Boxes for Dense Object
    Detection <https://arxiv.org/abs/2006.04388>`_.

    Args:
        use_sigmoid (bool): Whether the sigmoid is applied inside QFL.
            Defaults to True.
        beta (float): Exponent of the modulating factor. Defaults to 2.0.
        reduction (str): One of "none", "mean" and "sum".
        loss_weight (float): Weight applied to this loss.
    """

    def __init__(self,
                 use_sigmoid=True,
                 beta=2.0,
                 reduction='mean',
                 loss_weight=1.0):
        super(QualityFocalLoss, self).__init__()
        self.use_sigmoid = use_sigmoid
        self.beta = beta
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): Joint classification / quality (IoU) prediction
                with shape (N, C), C is the number of classes.
            target (tuple([Tensor])): Category label with shape (N,) and
                quality label with shape (N,).
            weight (Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Divisor used in place of the element
                count for 'mean' reduction. Defaults to None.
        """
        per_sample = quality_focal_loss(
            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)
        loss = self.loss_weight * per_sample
        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            # Plain reduction over all elements.
            if self.reduction == 'mean':
                return loss.mean()
            if self.reduction == 'sum':
                return loss.sum()
            return loss

        if self.reduction == 'mean':
            # 'mean' with an explicit factor: sum, then divide by the factor.
            return loss.sum() / avg_factor
        if self.reduction != 'none':
            # Only 'none' passes through unchanged alongside an avg_factor.
            raise ValueError(
                'avg_factor can not be used with reduction="sum"')
        return loss
@register
@serializable
class DistributionFocalLoss(nn.Layer):
    """Layer form of Distribution Focal Loss (DFL), from `Generalized Focal
    Loss: Learning Qualified and Distributed Bounding Boxes for Dense Object
    Detection <https://arxiv.org/abs/2006.04388>`_.

    Args:
        reduction (str): One of `'none'`, `'mean'` and `'sum'`.
        loss_weight (float): Weight applied to this loss.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(DistributionFocalLoss, self).__init__()
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): Predicted general distribution of bounding
                boxes (before softmax) with shape (N, n+1), n is the max value
                of the integral set `{0, ..., n}` in paper.
            target (Tensor): Target distance label for bounding boxes
                with shape (N,).
            weight (Tensor, optional): Per-prediction loss weight.
                Defaults to None.
            avg_factor (int, optional): Divisor used in place of the element
                count for 'mean' reduction. Defaults to None.
        """
        loss = self.loss_weight * distribution_focal_loss(pred, target)
        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            # Plain reduction over all elements.
            if self.reduction == 'mean':
                return loss.mean()
            if self.reduction == 'sum':
                return loss.sum()
            return loss

        if self.reduction == 'mean':
            # 'mean' with an explicit factor: sum, then divide by the factor.
            return loss.sum() / avg_factor
        if self.reduction != 'none':
            # Only 'none' passes through unchanged alongside an avg_factor.
            raise ValueError(
                'avg_factor can not be used with reduction="sum"')
        return loss

View File

@@ -0,0 +1,295 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import math
import paddle
from ppdet.core.workspace import register, serializable
from ..bbox_utils import bbox_iou
__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss']
@register
@serializable
class IouLoss(object):
    """
    IoU-based box regression loss, see https://arxiv.org/abs/1908.03851
    loss = loss_weight * (1 - iou * iou) when loss_square is set,
    otherwise loss_weight * (1 - iou)

    Args:
        loss_weight (float): iou loss weight, default is 2.5
        giou (bool): forwarded to bbox_iou as its ``giou`` flag
        diou (bool): forwarded to bbox_iou as its ``diou`` flag
        ciou (bool): forwarded to bbox_iou as its ``ciou`` flag
        loss_square (bool): whether to square the iou term
    """

    def __init__(self,
                 loss_weight=2.5,
                 giou=False,
                 diou=False,
                 ciou=False,
                 loss_square=True):
        self.loss_square = loss_square
        self.ciou = ciou
        self.diou = diou
        self.giou = giou
        self.loss_weight = loss_weight

    def __call__(self, pbox, gbox):
        overlap = bbox_iou(
            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
        # The squared form is the default.
        penalty = 1 - overlap * overlap if self.loss_square else 1 - overlap
        return penalty * self.loss_weight
@register
@serializable
class GIoULoss(object):
    """
    Generalized Intersection over Union loss, see https://arxiv.org/abs/1902.09630

    Args:
        loss_weight (float): giou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        reduction (string): Options are "none", "mean" and "sum". default as none
    """

    def __init__(self, loss_weight=1., eps=1e-10, reduction='none'):
        self.loss_weight = loss_weight
        self.eps = eps
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction

    def bbox_overlap(self, box1, box2, eps=1e-10):
        """Compute IoU, intersection and union of two sets of boxes.

        Args:
            box1 (sequence): unpacked into its four coordinates x1, y1, x2, y2
            box2 (sequence): unpacked into its four coordinates x1, y1, x2, y2
            eps (float): epsilon to avoid divide by zero
        Return:
            iou (Tensor): iou of box1 and box2
            overlap (Tensor): overlap of box1 and box2
            union (Tensor): union of box1 and box2
        """
        px1, py1, px2, py2 = box1
        gx1, gy1, gx2, gy2 = box2

        # Corners of the intersection rectangle.
        left = paddle.maximum(px1, gx1)
        top = paddle.maximum(py1, gy1)
        right = paddle.minimum(px2, gx2)
        bottom = paddle.minimum(py2, gy2)

        # Negative extents mean no intersection along that axis.
        inter_w = (right - left).clip(0)
        inter_h = (bottom - top).clip(0)
        overlap = inter_w * inter_h

        pred_area = (px2 - px1) * (py2 - py1)
        gt_area = (gx2 - gx1) * (gy2 - gy1)
        union = pred_area + gt_area - overlap + eps
        iou = overlap / union
        return iou, overlap, union

    def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None):
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
        iou, overlap, union = self.bbox_overlap(
            [x1, y1, x2, y2], [x1g, y1g, x2g, y2g], self.eps)

        # Smallest box enclosing both inputs.
        enclose_x1 = paddle.minimum(x1, x1g)
        enclose_y1 = paddle.minimum(y1, y1g)
        enclose_x2 = paddle.maximum(x2, x2g)
        enclose_y2 = paddle.maximum(y2, y2g)
        area_c = (enclose_x2 - enclose_x1) * (enclose_y2 - enclose_y1
                                              ) + self.eps
        miou = iou - ((area_c - union) / area_c)

        if loc_reweight is None:
            giou = 1 - miou
        else:
            loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1))
            loc_thresh = 0.9
            giou = 1 - (1 - loc_thresh
                        ) * miou - loc_thresh * miou * loc_reweight

        # iou_weight only takes part in the 'sum' and 'mean' reductions.
        if self.reduction == 'none':
            loss = giou
        elif self.reduction == 'sum':
            loss = paddle.sum(giou * iou_weight)
        else:
            loss = paddle.mean(giou * iou_weight)
        return loss * self.loss_weight
@register
@serializable
class DIouLoss(GIoULoss):
    """
    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287

    Args:
        loss_weight (float): diou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        use_complete_iou_loss (bool): whether to add the aspect-ratio (CIoU) term
    """

    def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):
        super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)
        self.use_complete_iou_loss = use_complete_iou_loss

    def __call__(self, pbox, gbox, iou_weight=1.):
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)

        # Centres and sizes, taken from the raw predicted corners.
        pred_cx = (x1 + x2) / 2
        pred_cy = (y1 + y2) / 2
        pred_w = x2 - x1
        pred_h = y2 - y1

        gt_cx = (x1g + x2g) / 2
        gt_cy = (y1g + y2g) / 2
        gt_w = x2g - x1g
        gt_h = y2g - y1g

        # Force x2 >= x1 and y2 >= y1 on the prediction before the overlap math.
        x2 = paddle.maximum(x1, x2)
        y2 = paddle.maximum(y1, y2)

        # intersection rectangle (A and B)
        inter_x1 = paddle.maximum(x1, x1g)
        inter_y1 = paddle.maximum(y1, y1g)
        inter_x2 = paddle.minimum(x2, x2g)
        inter_y2 = paddle.minimum(y2, y2g)

        # enclosing rectangle (A or B)
        outer_x1 = paddle.minimum(x1, x1g)
        outer_y1 = paddle.minimum(y1, y1g)
        outer_x2 = paddle.maximum(x2, x2g)
        outer_y2 = paddle.maximum(y2, y2g)

        inter_area = (inter_x2 - inter_x1) * (inter_y2 - inter_y1)
        # Zero the area where the boxes do not actually intersect.
        inter_area = inter_area * paddle.greater_than(
            inter_x2, inter_x1) * paddle.greater_than(inter_y2, inter_y1)
        union_area = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (
            y2g - y1g) - inter_area + self.eps
        iou = inter_area / union_area

        # DIOU term: squared centre distance over squared enclosing diagonal
        center_dist = (pred_cx - gt_cx) * (pred_cx - gt_cx) + (
            pred_cy - gt_cy) * (pred_cy - gt_cy)
        diag_dist = (outer_x2 - outer_x1) * (outer_x2 - outer_x1) + (
            outer_y2 - outer_y1) * (outer_y2 - outer_y1)
        diou_term = (center_dist + self.eps) / (diag_dist + self.eps)

        # CIOU term: aspect-ratio consistency
        ciou_term = 0
        if self.use_complete_iou_loss:
            ar_gt = gt_w / gt_h
            ar_pred = pred_w / pred_h
            arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)
            ar_loss = 4. / np.pi / np.pi * arctan * arctan
            alpha = ar_loss / (1 - iou + ar_loss + self.eps)
            # alpha is a trade-off weight and is excluded from the gradient.
            alpha.stop_gradient = True
            ciou_term = alpha * ar_loss

        diou = paddle.mean((1 - iou + ciou_term + diou_term) * iou_weight)
        return diou * self.loss_weight
@register
@serializable
class SIoULoss(GIoULoss):
    """
    SIoU loss (angle, distance and shape costs on top of IoU),
    see https://arxiv.org/pdf/2205.12740.pdf
    Args:
        loss_weight (float): siou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        theta (float): exponent of the shape cost, default as 4
        reduction (str): Options are "none", "mean" and "sum". default as none
    """

    def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'):
        super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps)
        self.loss_weight = loss_weight
        self.eps = eps
        self.theta = theta
        self.reduction = reduction

    def __call__(self, pbox, gbox):
        """Return the SIoU loss of ``pbox`` against ``gbox``.

        Both inputs are split into x1, y1, x2, y2 along the last axis.
        The result is reduced according to ``self.reduction`` and scaled
        by ``self.loss_weight``.
        """
        px1, py1, px2, py2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        gx1, gy1, gx2, gy2 = paddle.split(gbox, num_or_sections=4, axis=-1)
        # IoU is evaluated on the raw corners (before the clamp below).
        iou = bbox_iou([px1, py1, px2, py2], [gx1, gy1, gx2, gy2])

        pred_cx = (px1 + px2) / 2
        pred_cy = (py1 + py2) / 2
        pred_w = px2 - px1 + self.eps
        pred_h = py2 - py1 + self.eps
        gt_cx = (gx1 + gx2) / 2
        gt_cy = (gy1 + gy2) / 2
        gt_w = gx2 - gx1 + self.eps
        gt_h = gy2 - gy1 + self.eps

        # Force x2 >= x1 and y2 >= y1 on the prediction.
        px2 = paddle.maximum(px1, px2)
        py2 = paddle.maximum(py1, py2)

        # Extent of the smallest box enclosing both boxes.
        outer_x1 = paddle.minimum(px1, gx1)
        outer_y1 = paddle.minimum(py1, gy1)
        outer_x2 = paddle.maximum(px2, gx2)
        outer_y2 = paddle.maximum(py2, gy2)
        outer_w = outer_x2 - outer_x1
        outer_h = outer_y2 - outer_y1

        # Absolute centre offsets along y and x.
        center_dy = paddle.maximum(pred_cy, gt_cy) - paddle.minimum(pred_cy,
                                                                    gt_cy)
        center_dx = paddle.maximum(pred_cx, gt_cx) - paddle.minimum(pred_cx,
                                                                    gt_cx)

        # angle cost
        # NOTE(review): center_dist is zero when both centres coincide, so
        # the two divisions below look like they can yield 0/0 - confirm
        # whether callers can hit that case.
        center_dist = paddle.sqrt((pred_cx - gt_cx)**2 + (pred_cy - gt_cy)**2)
        sin_angle_alpha = center_dy / center_dist
        sin_angle_beta = center_dx / center_dist
        thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2
        thred.stop_gradient = True
        sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta,
                                 sin_angle_alpha)
        angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2)

        # distance cost
        gamma = 2 - angle_cost
        beta_x = ((gt_cx - pred_cx) / outer_w)**2
        beta_y = ((gt_cy - pred_cy) / outer_h)**2
        dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(
            -gamma * beta_y)

        # shape cost
        omega_w = paddle.abs(pred_w - gt_w) / paddle.maximum(pred_w, gt_w)
        omega_h = paddle.abs(gt_h - pred_h) / paddle.maximum(pred_h, gt_h)
        omega = (1 - paddle.exp(-omega_w))**self.theta + (
            1 - paddle.exp(-omega_h))**self.theta

        siou_loss = 1 - iou + (omega + dist_cost) / 2
        if self.reduction == 'mean':
            siou_loss = paddle.mean(siou_loss)
        elif self.reduction == 'sum':
            siou_loss = paddle.sum(siou_loss)
        return siou_loss * self.loss_weight

View File

@@ -0,0 +1,60 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
__all__ = ['SmoothL1Loss']
@register
class SmoothL1Loss(nn.Layer):
    """Smooth L1 Loss.
    Args:
        beta (float): width of the quadratic region; below 1e-5 the loss
            is computed as a plain L1 loss
        loss_weight (float): the final loss will be multiplied by this
    """

    def __init__(self, beta=1.0, loss_weight=1.0):
        super(SmoothL1Loss, self).__init__()
        assert beta >= 0
        self.beta = beta
        self.loss_weight = loss_weight

    def forward(self, pred, target, reduction='none'):
        """forward function, based on fvcore.
        Args:
            pred (Tensor): prediction tensor
            target (Tensor): target tensor, pred.shape must be the same as
                target.shape; it is detached before use
            reduction (str): the way to reduce loss, one of (none, sum, mean)
        Returns:
            Tensor: the (optionally reduced) loss times ``loss_weight``
        """
        assert reduction in ('none', 'sum', 'mean')
        target = target.detach()
        abs_err = paddle.abs(pred - target)

        if self.beta < 1e-5:
            # A (near) zero beta would divide by ~0 in the quadratic
            # branch, so use pure L1 instead.
            loss = abs_err
        else:
            quadratic = 0.5 * abs_err ** 2 / self.beta
            linear = abs_err - 0.5 * self.beta
            loss = paddle.where(abs_err < self.beta, quadratic, linear)

        if reduction == 'sum':
            loss = loss.sum()
        elif reduction == 'mean':
            if loss.size > 0:
                loss = loss.mean()
            else:
                # Empty input: return a zero built from the tensor itself
                # instead of taking the mean of nothing.
                loss = 0.0 * loss.sum()
        return loss * self.loss_weight

View File

@@ -0,0 +1,152 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling import ops
__all__ = ['VarifocalLoss']
def varifocal_loss(pred,
                   target,
                   alpha=0.75,
                   gamma=2.0,
                   iou_weighted=True,
                   use_sigmoid=True):
    """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
    Args:
        pred (Tensor): The prediction with shape (N, C), C is the
            number of classes
        target (Tensor): The learning target of the iou-aware
            classification score with shape (N, C), C is the number of classes.
        alpha (float, optional): A balance factor for the negative part of
            Varifocal Loss, which is different from the alpha of Focal Loss.
            Defaults to 0.75.
        gamma (float, optional): The gamma for calculating the modulating
            factor. Defaults to 2.0.
        iou_weighted (bool, optional): Whether to weight the loss of the
            positive example with the iou target. Defaults to True.
        use_sigmoid (bool, optional): If True, ``pred`` is treated as logits
            (sigmoid is applied for the weight and the loss uses
            binary_cross_entropy_with_logits); otherwise ``pred`` is used
            as-is with binary_cross_entropy. Defaults to True.
    Returns:
        Tensor: the weighted loss summed over axis 1.
    """
    # pred and target should be of the same size
    assert pred.shape == target.shape
    prob = F.sigmoid(pred) if use_sigmoid else pred
    target = target.cast(pred.dtype)

    # Positives (target > 0) get weight `target` (or 1 when not
    # iou-weighted); the remaining entries get the focal-style
    # alpha * |p - q|^gamma weight.
    positive_mask = (target > 0.0).cast('float32')
    negative_weight = alpha * (prob - target).abs().pow(gamma) * \
        (target <= 0.0).cast('float32')
    if iou_weighted:
        focal_weight = target * positive_mask + negative_weight
    else:
        focal_weight = positive_mask + negative_weight

    bce = (F.binary_cross_entropy_with_logits
           if use_sigmoid else F.binary_cross_entropy)
    per_class_loss = bce(pred, target, reduction='none') * focal_weight
    return per_class_loss.sum(axis=1)
@register
@serializable
class VarifocalLoss(nn.Layer):
    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.75,
                 gamma=2.0,
                 iou_weighted=True,
                 reduction='mean',
                 loss_weight=1.0):
        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_
        Args:
            use_sigmoid (bool, optional): Whether the prediction is given
                as logits (sigmoid is applied inside the loss) or is used
                as-is. Defaults to True.
            alpha (float, optional): A balance factor for the negative part of
                Varifocal Loss, which is different from the alpha of Focal
                Loss. Defaults to 0.75.
            gamma (float, optional): The gamma for calculating the modulating
                factor. Defaults to 2.0.
            iou_weighted (bool, optional): Whether to weight the loss of the
                positive examples with the iou target. Defaults to True.
            reduction (str, optional): The method used to reduce the loss into
                a scalar. Defaults to 'mean'. Options are "none", "mean" and
                "sum".
            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
        """
        super(VarifocalLoss, self).__init__()
        assert alpha >= 0.0
        self.use_sigmoid = use_sigmoid
        self.alpha = alpha
        self.gamma = gamma
        self.iou_weighted = iou_weighted
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.
        Args:
            pred (Tensor): The prediction.
            target (Tensor): The learning target of the prediction.
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
        Returns:
            Tensor: The calculated loss
        Raises:
            ValueError: if ``reduction`` is not one of "none", "mean",
                "sum", or if ``avg_factor`` is combined with a reduction
                other than "none"/"mean".
        """
        loss = self.loss_weight * varifocal_loss(
            pred,
            target,
            alpha=self.alpha,
            gamma=self.gamma,
            iou_weighted=self.iou_weighted,
            use_sigmoid=self.use_sigmoid)
        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            if self.reduction == 'mean':
                return loss.mean()
            if self.reduction == 'sum':
                return loss.sum()
            # Previously an unknown reduction fell through and returned
            # None, which only failed later and far from the cause.
            raise ValueError(
                'Unsupported reduction {!r}, expected "none", "mean" or '
                '"sum"'.format(self.reduction))

        # if reduction is mean, then average the loss by avg_factor
        if self.reduction == 'mean':
            loss = loss.sum() / avg_factor
        # if reduction is 'none', then do nothing, otherwise raise an error
        elif self.reduction != 'none':
            raise ValueError(
                'avg_factor can not be used with reduction="sum"')
        return loss

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,244 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle
import paddle.nn.functional as F
from ppdet.core.workspace import register
from .transformers import bbox_cxcywh_to_xyxy
__all__ = [
'DETRPostProcess',
]
@register
class DETRPostProcess(object):
    """Decode DETR-style head outputs into boxes, scores, labels and masks."""
    __shared__ = ['num_classes', 'use_focal_loss', 'with_mask']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 dual_queries=False,
                 dual_groups=0,
                 use_focal_loss=False,
                 with_mask=False,
                 mask_threshold=0.5,
                 use_avg_mask_score=False,
                 bbox_decode_type='origin'):
        """
        Args:
            num_classes (int): number of classes, used to split the flattened
                top-k index into (query, class) when focal loss is used.
            num_top_queries (int): number of predictions kept per image.
            dual_queries (bool): if True only the first
                ``num_queries // (dual_groups + 1)`` queries are decoded.
            dual_groups (int): see ``dual_queries``.
            use_focal_loss (bool): sigmoid scores over all classes if True,
                otherwise softmax scores with the last class dropped.
            with_mask (bool): whether masks are decoded as well.
            mask_threshold (float): threshold applied to the mask sigmoid.
            use_avg_mask_score (bool): scale box scores by the mean mask
                score inside the thresholded mask.
            bbox_decode_type (str): 'origin' or 'pad'.
        """
        super(DETRPostProcess, self).__init__()
        assert bbox_decode_type in ['origin', 'pad']

        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.dual_queries = dual_queries
        self.dual_groups = dual_groups
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.mask_threshold = mask_threshold
        self.use_avg_mask_score = use_avg_mask_score
        self.bbox_decode_type = bbox_decode_type

    def _mask_postprocess(self, mask_pred, score_pred, index):
        """Select, binarise and optionally score the predicted masks.

        Args:
            mask_pred (Tensor): mask logits.
            score_pred (Tensor): box scores; updated in place when
                ``use_avg_mask_score`` is set.
            index (Tensor | None): gather_nd index of the kept queries, or
                None when every query is kept.
        Returns:
            tuple: (binary int32 masks of the first image, scores)
        """
        if index is not None:
            mask_pred = paddle.gather_nd(mask_pred, index)
        mask_score = F.sigmoid(mask_pred)
        mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype)
        if self.use_avg_mask_score:
            avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / (
                mask_pred.sum([-2, -1]) + 1e-6)
            score_pred *= avg_mask_score

        # Only the first image is returned (bs > 1 is not supported yet).
        return mask_pred[0].astype('int32'), score_pred

    def __call__(self, head_out, im_shape, scale_factor, pad_shape):
        """
        Decode the bbox and mask.

        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image without padding.
            scale_factor (Tensor): The scale factor of the input image.
            pad_shape (Tensor): The shape of the input image with padding.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
            mask_pred (Tensor | None): binary masks of the first image when
                ``with_mask`` is set, otherwise None.
        """
        bboxes, logits, masks = head_out
        if self.dual_queries:
            num_queries = logits.shape[1]
            num_kept = int(num_queries // (self.dual_groups + 1))
            logits, bboxes = logits[:, :num_kept, :], bboxes[:, :num_kept, :]

        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        # calculate the original shape of the image
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = paddle.split(origin_shape, 2, axis=-1)
        if self.bbox_decode_type == 'pad':
            # calculate the shape of the image with padding
            out_shape = pad_shape / im_shape * origin_shape
            out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1)
        elif self.bbox_decode_type == 'origin':
            out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1)
        else:
            raise Exception(
                f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.')
        bbox_pred *= out_shape

        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]

        # Stays None when no top-k selection happens, so the mask branch
        # below keeps every query instead of hitting an unbound name.
        index = None
        if not self.use_focal_loss:
            scores, labels = scores.max(-1), scores.argmax(-1)
            if scores.shape[1] > self.num_top_queries:
                scores, index = paddle.topk(
                    scores, self.num_top_queries, axis=-1)
                batch_ind = paddle.arange(
                    end=scores.shape[0]).unsqueeze(-1).tile(
                        [1, self.num_top_queries])
                index = paddle.stack([batch_ind, index], axis=-1)
                labels = paddle.gather_nd(labels, index)
                bbox_pred = paddle.gather_nd(bbox_pred, index)
        else:
            # Top-k over the flattened (query, class) scores; the flat
            # index is split back into a class id and a query id.
            scores, index = paddle.topk(
                scores.flatten(1), self.num_top_queries, axis=-1)
            labels = index % self.num_classes
            index = index // self.num_classes
            batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(
                [1, self.num_top_queries])
            index = paddle.stack([batch_ind, index], axis=-1)
            bbox_pred = paddle.gather_nd(bbox_pred, index)

        mask_pred = None
        if self.with_mask:
            assert masks is not None
            masks = F.interpolate(
                masks, scale_factor=4, mode="bilinear", align_corners=False)
            # TODO: Support prediction with bs>1.
            # remove padding for input image
            h, w = im_shape.astype('int32')[0]
            masks = masks[..., :h, :w]
            # get pred_mask in the original resolution.
            img_h = img_h[0].astype('int32')
            img_w = img_w[0].astype('int32')
            masks = F.interpolate(
                masks,
                size=(img_h, img_w),
                mode="bilinear",
                align_corners=False)
            mask_pred, scores = self._mask_postprocess(masks, scores, index)

        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        # Use the number of boxes actually kept per image: the softmax
        # branch keeps fewer than num_top_queries when the head emits
        # fewer queries, and bbox_num must match the rows produced below.
        bbox_num = paddle.to_tensor(
            bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num, mask_pred
def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False):
    """
    Paste the mask prediction to the original image.

    Each mask is resampled with ``F.grid_sample`` on a grid whose
    coordinates are the image pixel centres expressed in the [-1, 1]
    frame of the corresponding box.

    Args:
        masks (Tensor): mask predictions; channel 0 of the sampled result
            is returned.
        boxes (Tensor): boxes, split into x0, y0, x1, y1 along axis 1.
        im_h (int): height of the output image.
        im_w (int): width of the output image.
        assign_on_cpu (bool): if True, ``paddle.set_device('cpu')`` is
            called before the grid is built (the device is not restored).
    """
    x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1)
    num_masks = masks.shape[0]

    # Pixel centres of the full image, then normalised per box to [-1, 1].
    pixel_y = paddle.arange(0, im_h) + 0.5
    pixel_x = paddle.arange(0, im_w) + 0.5
    norm_y = (pixel_y - y0) / (y1 - y0) * 2 - 1
    norm_x = (pixel_x - x0) / (x1 - x0) * 2 - 1
    # norm_x, norm_y have shapes (N, w), (N, h)

    if assign_on_cpu:
        paddle.set_device('cpu')

    grid_shape = [num_masks, paddle.shape(norm_y)[1], paddle.shape(norm_x)[1]]
    grid_x = norm_x[:, None, :].expand(grid_shape)
    grid_y = norm_y[:, :, None].expand(grid_shape)
    sampling_grid = paddle.stack([grid_x, grid_y], axis=3)

    pasted = F.grid_sample(masks, sampling_grid, align_corners=False)
    return pasted[:, 0]
def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
    """Run ``nms`` independently on the detections of every class.

    Args:
        bboxs: array whose column 0 is the class id and whose remaining
            columns are passed to ``nms``.
        num_classes (int): class ids ``0 .. num_classes - 1`` are processed.
        match_threshold (float): overlap threshold forwarded to ``nms``.
        match_metric (str): metric name forwarded to ``nms``.
    Returns:
        list: one array per class that has detections, each with the class
        id prepended as column 0 to the rows kept by ``nms``.
    """
    per_class_results = []
    for class_id in range(num_classes):
        class_mask = bboxs[:, 0] == class_id
        if np.count_nonzero(class_mask) > 0:
            kept = nms(bboxs[class_mask, 1:], match_threshold, match_metric)
            class_column = np.full((kept.shape[0], 1), class_id)
            per_class_results.append(np.concatenate([class_column, kept], 1))
    return per_class_results
def nms(dets, match_threshold=0.6, match_metric='iou'):
    """ Apply NMS to avoid detecting too many overlapping bounding boxes.
    Args:
        dets: shape [N, 5], [score, x1, y1, x2, y2]
        match_threshold: overlap thresh for match metric.
        match_metric: 'iou' (intersection over union) or 'ios'
            (intersection over the smaller box area).
    Returns:
        The rows of ``dets`` that survive, in their original row order.
    Raises:
        ValueError: if ``match_metric`` is neither 'iou' nor 'ios'.
    """
    # Validate once up front instead of deep inside the pair loop, where
    # it was only reached when at least two boxes had to be compared.
    if match_metric not in ('iou', 'ios'):
        raise ValueError(
            "match_metric must be 'iou' or 'ios', got {!r}".format(
                match_metric))

    if dets.shape[0] == 0:
        return dets[[], :]

    scores = dets[:, 0]
    x1 = dets[:, 1]
    y1 = dets[:, 2]
    x2 = dets[:, 3]
    y2 = dets[:, 4]
    # The "+ 1" treats coordinates as inclusive pixel indices.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    suppressed = np.zeros(dets.shape[0], dtype=bool)
    for rank, i in enumerate(order):
        if suppressed[i]:
            continue
        # Lower-scored boxes that are still alive; compare them to box i
        # in one vectorised step instead of a Python inner loop.
        rest = order[rank + 1:]
        rest = rest[~suppressed[rest]]
        if rest.size == 0:
            continue

        inter_w = np.maximum(
            0.0, np.minimum(x2[i], x2[rest]) - np.maximum(x1[i], x1[rest]) + 1)
        inter_h = np.maximum(
            0.0, np.minimum(y2[i], y2[rest]) - np.maximum(y1[i], y1[rest]) + 1)
        inter = inter_w * inter_h

        if match_metric == 'iou':
            denom = areas[i] + areas[rest] - inter
        else:  # 'ios'
            denom = np.minimum(areas[i], areas[rest])
        match_value = inter / denom

        suppressed[rest[match_value >= match_threshold]] = True

    return dets[~suppressed]

View File

@@ -0,0 +1,25 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# The code is based on:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py
from collections import namedtuple
class ShapeSpec(
        namedtuple("_ShapeSpec", ("channels", "height", "width", "stride"))):
    """Named tuple describing a feature map; every field defaults to None."""

    def __new__(cls, channels=None, height=None, width=None, stride=None):
        # namedtuple fields have no defaults of their own, so they are
        # supplied here and forwarded positionally.
        return super().__new__(cls, channels, height, width, stride)

View File

@@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .utils import *
from .matchers import *
from .position_encoding import *
from .rtdetr_transformer import *
from .dino_transformer import *
from .hybrid_encoder import *

View File

@@ -0,0 +1,537 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding
from .utils import _get_clones, get_valid_ratio
from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
__all__ = ['DeformableTransformer']
class MSDeformableAttention(nn.Layer):
    def __init__(self,
                 embed_dim=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 lr_mult=0.1):
        """
        Multi-Scale Deformable Attention Module

        Args:
            embed_dim (int): channel size of query/value; must be divisible
                by ``num_heads``.
            num_heads (int): number of attention heads.
            num_levels (int): number of feature levels sampled.
            num_points (int): sampling points per head and level.
            lr_mult (float): learning-rate multiplier of the
                sampling-offset projection.
        """
        super(MSDeformableAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        self.total_points = num_heads * num_levels * num_points

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.sampling_offsets = nn.Linear(
            embed_dim,
            self.total_points * 2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.attention_weights = nn.Linear(embed_dim, self.total_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)
        try:
            # use cuda op
            from deformable_detr_ops import ms_deformable_attn
        except Exception:
            # Any failure to load the optional custom op falls back to the
            # pure-paddle implementation. `Exception` (not a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            from .utils import deformable_attention_core_func as ms_deformable_attn
        self.ms_deformable_attn_core = ms_deformable_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise offsets, attention weights and projections."""
        # sampling_offsets: zero weight; the bias places the points of each
        # head along its own direction, at growing distance per point.
        constant_(self.sampling_offsets.weight)
        thetas = paddle.arange(
            self.num_heads,
            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)
        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
            [1, self.num_levels, self.num_points, 1])
        scaling = paddle.arange(
            1, self.num_points + 1,
            dtype=paddle.float32).reshape([1, 1, -1, 1])
        grid_init *= scaling
        self.sampling_offsets.bias.set_value(grid_init.flatten())

        # attention_weights
        constant_(self.attention_weights.weight)
        constant_(self.attention_weights.bias)

        # proj
        xavier_uniform_(self.value_proj.weight)
        constant_(self.value_proj.bias)
        xavier_uniform_(self.output_proj.weight)
        constant_(self.output_proj.bias)

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: if the last dim of ``reference_points`` is not 2 or 4.
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        assert int(value_spatial_shapes.prod(1).sum()) == Len_v

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
        # Softmax runs jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])

        if reference_points.shape[-1] == 2:
            # Point references: offsets are divided by each level's (W, H).
            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2])
            sampling_locations = reference_points.reshape([
                bs, Len_q, 1, self.num_levels, 1, 2
            ]) + sampling_offsets / offset_normalizer
        elif reference_points.shape[-1] == 4:
            # Box references: offsets are scaled by half of the last two
            # components of the reference.
            sampling_locations = (
                reference_points[:, :, None, :, None, :2] + sampling_offsets /
                self.num_points * reference_points[:, :, None, :, None, 2:] *
                0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        output = self.ms_deformable_attn_core(
            value, value_spatial_shapes, value_level_start_index,
            sampling_locations, attention_weights)
        output = self.output_proj(output)

        return output
class DeformableTransformerEncoderLayer(nn.Layer):
    """Encoder layer: deformable self-attention followed by an FFN, each
    wrapped as residual + dropout + LayerNorm (post-norm)."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super(DeformableTransformerEncoderLayer, self).__init__()
        # self attention
        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                               n_points, lr_mult)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # `activation` is looked up by name in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then re-draw their weights with
        xavier-uniform."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)
        for ffn_linear in (self.linear1, self.linear2):
            xavier_uniform_(ffn_linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Add the positional embedding when one is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Feed-forward sub-layer with residual connection and norm."""
        hidden = self.activation(self.linear1(src))
        ffn_out = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(ffn_out))

    def forward(self,
                src,
                reference_points,
                spatial_shapes,
                level_start_index,
                src_mask=None,
                query_pos_embed=None):
        # self attention: the query carries the positional embedding, the
        # value is the raw `src`.
        query = self.with_pos_embed(src, query_pos_embed)
        attn_out = self.self_attn(query, reference_points, src,
                                  spatial_shapes, level_start_index, src_mask)
        src = self.norm1(src + self.dropout1(attn_out))
        # ffn
        return self.forward_ffn(src)
class DeformableTransformerEncoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``encoder_layer``."""

    def __init__(self, encoder_layer, num_layers):
        super(DeformableTransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
        """Build the reference point of every feature-map cell.

        Args:
            spatial_shapes: iterable of (H, W), one per level.
            valid_ratios (Tensor): indexed as [:, level, 0] for x and
                [:, level, 1] for y.
            offset (float): added to each cell index before normalisation.
        Returns:
            Tensor: points of all levels concatenated along axis 1, with a
            level axis inserted at position 2 and multiplied by
            ``valid_ratios``.
        """
        valid_ratios = valid_ratios.unsqueeze(1)
        points_per_level = []
        for level, (H, W) in enumerate(spatial_shapes):
            rows = paddle.arange(end=H) + offset
            cols = paddle.arange(end=W) + offset
            grid_y, grid_x = paddle.meshgrid(rows, cols)
            # Normalise by the valid (unpadded) extent of this level.
            norm_y = grid_y.flatten().unsqueeze(0) / (
                valid_ratios[:, :, level, 1] * H)
            norm_x = grid_x.flatten().unsqueeze(0) / (
                valid_ratios[:, :, level, 0] * W)
            points_per_level.append(paddle.stack((norm_x, norm_y), axis=-1))
        all_points = paddle.concat(points_per_level, 1).unsqueeze(2)
        return all_points * valid_ratios

    def forward(self,
                feat,
                spatial_shapes,
                level_start_index,
                feat_mask=None,
                query_pos_embed=None,
                valid_ratios=None):
        if valid_ratios is None:
            # No ratios given: use 1 for every level.
            batch_size = feat.shape[0]
            num_levels = spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch_size, num_levels, 2])
        reference_points = self.get_reference_points(spatial_shapes,
                                                     valid_ratios)
        encoded = feat
        for encoder_layer in self.layers:
            encoded = encoder_layer(encoded, reference_points, spatial_shapes,
                                    level_start_index, feat_mask,
                                    query_pos_embed)
        return encoded
class DeformableTransformerDecoderLayer(nn.Layer):
    """Decoder layer: multi-head self-attention, deformable
    cross-attention and an FFN, each wrapped as residual + dropout +
    LayerNorm (post-norm)."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super(DeformableTransformerDecoderLayer, self).__init__()

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # cross attention
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # `activation` is looked up by name in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then re-draw their weights with
        xavier-uniform."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)
        for ffn_linear in (self.linear1, self.linear2):
            xavier_uniform_(ffn_linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Add the positional embedding when one is given."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward sub-layer with residual connection and norm."""
        hidden = self.activation(self.linear1(tgt))
        ffn_out = self.linear2(self.dropout3(hidden))
        return self.norm3(tgt + self.dropout4(ffn_out))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        # self attention: query and key share the position-augmented
        # target, the value is the raw target.
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        self_attn_out = self.self_attn(q, k, value=tgt)
        tgt = self.norm1(tgt + self.dropout1(self_attn_out))

        # cross attention over the encoder memory
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_attn_out = self.cross_attn(
            cross_query, reference_points, memory, memory_spatial_shapes,
            memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_attn_out))

        # ffn
        return self.forward_ffn(tgt)
class DeformableTransformerDecoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``decoder_layer``."""

    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        super(DeformableTransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        """Run every decoder layer in order.

        Returns:
            Tensor: the outputs of all layers stacked along a new leading
            axis when ``return_intermediate`` is set, otherwise the last
            output with a leading axis of size 1.
        """
        hidden = tgt
        per_layer_outputs = []
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, reference_points, memory,
                                   memory_spatial_shapes,
                                   memory_level_start_index, memory_mask,
                                   query_pos_embed)
            if self.return_intermediate:
                per_layer_outputs.append(hidden)

        if not self.return_intermediate:
            return hidden.unsqueeze(0)
        return paddle.stack(per_layer_outputs)
@register
class DeformableTransformer(nn.Layer):
    """Multi-level deformable transformer (encoder followed by decoder).

    Backbone feature maps are projected to ``hidden_dim`` channels, extra
    levels are derived with stride-2 convolutions until there are
    ``num_feature_levels`` of them, and the flattened levels are passed
    through the deformable encoder and decoder.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=300,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 in_feats_channel=[512, 1024, 2048],
                 num_feature_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 lr_mult=0.1,
                 pe_temperature=10000,
                 pe_offset=-0.5):
        super().__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        # NOTE: sub-layers are created in the same order as before so that
        # parameter registration and initializer order stay unchanged.
        self.encoder = DeformableTransformerEncoder(
            DeformableTransformerEncoderLayer(
                hidden_dim, nhead, dim_feedforward, dropout, activation,
                num_feature_levels, num_encoder_points, lr_mult),
            num_encoder_layers)
        self.decoder = DeformableTransformerDecoder(
            DeformableTransformerDecoderLayer(
                hidden_dim, nhead, dim_feedforward, dropout, activation,
                num_feature_levels, num_decoder_points),
            num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        # maps each query positional embedding to a 2-d reference point
        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.input_proj = nn.LayerList()
        # one 1x1 projection per backbone level
        for in_channels in in_feats_channel:
            lateral = nn.Sequential(
                nn.Conv2D(
                    in_channels, hidden_dim, kernel_size=1),
                nn.GroupNorm(32, hidden_dim))
            self.input_proj.append(lateral)
        # remaining levels come from stride-2 3x3 convolutions
        in_channels = in_feats_channel[-1]
        num_extra_levels = num_feature_levels - len(in_feats_channel)
        for _ in range(num_extra_levels):
            downsample = nn.Sequential(
                nn.Conv2D(
                    in_channels,
                    hidden_dim,
                    kernel_size=3,
                    stride=2,
                    padding=1),
                nn.GroupNorm(32, hidden_dim))
            self.input_proj.append(downsample)
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize embeddings, the reference-point head and projections."""
        for embedding in (self.level_embed, self.tgt_embed,
                          self.query_pos_embed):
            normal_(embedding.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for proj in self.input_proj:
            conv = proj[0]
            xavier_uniform_(conv.weight)
            constant_(conv.bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_feats_channel`` from the ``channels`` of each input."""
        channels = [shape.channels for shape in input_shape]
        return {'in_feats_channel': channels, }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        """Encode the feature levels and decode the learned queries.

        Args:
            src_feats: Sequence of backbone feature maps, one per level.
            src_mask: Optional mask that is resized to every level with
                ``F.interpolate``; when None an all-ones mask is used.

        Returns:
            tuple: ``(hs, memory, reference_points)`` -- decoder output,
            encoder output and the sigmoid-ed reference points.
        """
        # project the backbone levels
        srcs = [
            self.input_proj[i](feat) for i, feat in enumerate(src_feats)
        ]
        # build the extra levels: the first one from the last backbone map,
        # every later one from the previously produced level
        num_backbone_levels = len(srcs)
        for i in range(num_backbone_levels, self.num_feature_levels):
            if i == num_backbone_levels:
                previous = src_feats[-1]
            else:
                previous = srcs[-1]
            srcs.append(self.input_proj[i](previous))

        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            # keep the sizes as 1-element shape tensors
            src_shape = paddle.shape(src)
            bs = src_shape[0:1]
            h = src_shape[2:3]
            w = src_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))

            # [b, c, h, w] -> [b, h*w, c]
            src_flatten.append(src.flatten(2).transpose([0, 2, 1]))

            if src_mask is None:
                mask = paddle.ones([bs, h, w])
            else:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            valid_ratios.append(get_valid_ratio(mask))

            pos_embed = self.position_embedding(mask).flatten(1, 2)
            lvl_pos_embed_flatten.append(pos_embed +
                                         self.level_embed.weight[level])
            mask_flatten.append(mask.flatten(1))

        src_flatten = paddle.concat(src_flatten, 1)
        if src_mask is None:
            mask_flatten = None
        else:
            mask_flatten = paddle.concat(mask_flatten, 1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], start index of each level
        level_sizes = spatial_shapes.prod(1)
        level_start_index = paddle.concat(
            [paddle.zeros(
                [1], dtype='int64'), level_sizes.cumsum(0)[:-1]])
        # [b, l, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten,
                              valid_ratios)

        # prepare input for decoder
        bs, _, _ = memory.shape
        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        reference_points = F.sigmoid(self.reference_points(query_embed))
        reference_points_input = reference_points.unsqueeze(
            2) * valid_ratios.unsqueeze(1)

        # decoder
        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
                          level_start_index, mask_flatten, query_embed)

        return (hs, memory, reference_points)

View File

@@ -0,0 +1,359 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention, _convert_attention_mask
from .position_encoding import PositionEmbedding
from .utils import _get_clones
from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
__all__ = ['DETRTransformer']
class TransformerEncoderLayer(nn.Layer):
    """One encoder layer: self attention followed by a feed-forward block.

    ``normalize_before`` selects whether each LayerNorm runs before its
    sub-block (pre-norm) or after the residual addition (post-norm).
    ``attn_dropout`` and ``act_dropout`` fall back to ``dropout`` when None.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # feed-forward block
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        # activation is looked up by name in paddle.nn.functional
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Pass both feed-forward linear layers to ``linear_init_``."""
        for layer in (self.linear1, self.linear2):
            linear_init_(layer)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply self attention and the feed-forward block to ``src``.

        Args:
            src: Input features.
            src_mask: Optional mask passed to the attention as ``attn_mask``.
            pos_embed: Optional positional term added to queries and keys.
        """
        pre_norm = self.normalize_before

        # self-attention sub-block
        shortcut = src
        if pre_norm:
            src = self.norm1(src)
        query = key = self.with_pos_embed(src, pos_embed)
        attended = self.self_attn(query, key, value=src, attn_mask=src_mask)
        src = shortcut + self.dropout1(attended)
        if not pre_norm:
            src = self.norm1(src)

        # feed-forward sub-block
        shortcut = src
        if pre_norm:
            src = self.norm2(src)
        hidden = self.dropout(self.activation(self.linear1(src)))
        src = shortcut + self.dropout2(self.linear2(hidden))
        if not pre_norm:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """Chain of ``num_layers`` cloned encoder layers plus an optional norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        """Run ``src`` through every layer, then through ``norm`` if set."""
        hidden = src
        for encoder_layer in self.layers:
            hidden = encoder_layer(
                hidden, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is None:
            return hidden
        return self.norm(hidden)
class TransformerDecoderLayer(nn.Layer):
    """One decoder layer: self attention, cross attention, feed-forward.

    ``normalize_before`` selects pre-norm or post-norm placement of the
    three LayerNorms. ``attn_dropout`` and ``act_dropout`` fall back to
    ``dropout`` when None.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # feed-forward block
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        # activation is looked up by name in paddle.nn.functional
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Pass both feed-forward linear layers to ``linear_init_``."""
        for layer in (self.linear1, self.linear2):
            linear_init_(layer)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Apply the three sub-blocks to ``tgt``.

        Args:
            tgt: Query features.
            memory: Encoder output used as the cross-attention value.
            tgt_mask: Optional self-attention mask; converted with
                ``_convert_attention_mask`` to ``tgt.dtype`` first.
            memory_mask: Optional mask for the cross attention.
            pos_embed: Optional positional term added to the memory keys.
            query_pos_embed: Optional positional term added to the queries.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        pre_norm = self.normalize_before

        # self-attention sub-block
        shortcut = tgt
        if pre_norm:
            tgt = self.norm1(tgt)
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        attended = self.self_attn(query, key, value=tgt, attn_mask=tgt_mask)
        tgt = shortcut + self.dropout1(attended)
        if not pre_norm:
            tgt = self.norm1(tgt)

        # cross-attention sub-block
        shortcut = tgt
        if pre_norm:
            tgt = self.norm2(tgt)
        query = self.with_pos_embed(tgt, query_pos_embed)
        key = self.with_pos_embed(memory, pos_embed)
        attended = self.cross_attn(
            query, key, value=memory, attn_mask=memory_mask)
        tgt = shortcut + self.dropout2(attended)
        if not pre_norm:
            tgt = self.norm2(tgt)

        # feed-forward sub-block
        shortcut = tgt
        if pre_norm:
            tgt = self.norm3(tgt)
        hidden = self.dropout(self.activation(self.linear1(tgt)))
        tgt = shortcut + self.dropout3(self.linear2(hidden))
        if not pre_norm:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """Chain of ``num_layers`` cloned decoder layers plus an optional norm.

    Args:
        decoder_layer: Layer instance handed to ``_get_clones``.
        num_layers (int): Number of clones requested from ``_get_clones``.
        norm: Optional layer applied to the decoder outputs. May be None.
        return_intermediate (bool): When True, ``forward`` returns the
            stacked (normalized, if ``norm`` is set) output of every layer.
    """

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Run ``tgt`` through every decoder layer.

        Args:
            tgt: Query features fed to the first layer.
            memory: Encoder output forwarded to every layer.
            tgt_mask: Optional self-attention mask; converted with
                ``_convert_attention_mask`` to ``tgt.dtype`` first.
            memory_mask: Optional cross-attention mask.
            pos_embed: Optional positional term for the memory keys.
            query_pos_embed: Optional positional term for the queries.

        Returns:
            ``paddle.stack`` of the per-layer outputs when
            ``return_intermediate`` is set, otherwise the last output with a
            new leading axis (``unsqueeze(0)``). ``norm`` is applied to the
            returned features whenever it is not None.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

        output = tgt
        intermediate = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # Previously self.norm was called unconditionally here, which
                # raised TypeError for the default norm=None.
                intermediate.append(output if self.norm is None else
                                    self.norm(output))

        if self.return_intermediate:
            # The last entry is already normalized, so no extra norm pass
            # over ``output`` is needed on this path.
            return paddle.stack(intermediate)

        if self.norm is not None:
            output = self.norm(output)
        return output.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    """DETR encoder/decoder transformer over the last backbone feature map.

    The last feature map is projected to ``hidden_dim`` channels with a 1x1
    convolution, flattened, encoded, and then decoded with ``num_queries``
    learned query positional embeddings.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_temperature=10000,
                 pe_offset=0.,
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        # NOTE: sub-layers are created in the same order as before so that
        # parameter registration and initializer order stay unchanged.
        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # the encoder only gets a final norm in pre-norm mode
        encoder_norm = None
        if normalize_before:
            encoder_norm = nn.LayerNorm(hidden_dim)
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize all parameters, then the projection and queries."""
        # every parameter with more than one dimension gets xavier_uniform_
        for param in self.parameters():
            if param.dim() > 1:
                xavier_uniform_(param)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Use the ``channels`` of the last input as the backbone width."""
        channels = [shape.channels for shape in input_shape]
        return {'backbone_num_channels': channels[-1], }

    def _convert_attention_mask(self, mask):
        """Map mask value 1 to 0 and mask value 0 to -1e9."""
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape
                [[bs, c, h, w]]. Only the last one is used.
            src_mask (Tensor, optional): Mask that is resized to the
                projected feature map and used to build the positional
                embedding; when None an all-ones mask is used. In training
                mode it is additionally turned into an additive attention
                mask via ``(mask - 1.0) * 1e9``; outside training no
                attention mask is passed on. Default None.

        Returns:
            tuple: ``(output, memory, src_proj, src_mask)`` where
                ``output`` is the decoder result, ``memory`` is the encoder
                result reshaped to [bs, c, h, w], ``src_proj`` is the
                projected feature map, and ``src_mask`` is the additive mask
                reshaped to [bs, 1, 1, h, w] in training mode, else None.
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = paddle.shape(src_proj)
        # flatten [B, C, H, W] to [B, HxW, C]
        tokens = src_proj.flatten(2).transpose([0, 2, 1])

        if src_mask is None:
            src_mask = paddle.ones([bs, h, w])
        else:
            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
        # the positional embedding is built from the un-converted mask
        pos_embed = self.position_embedding(src_mask).flatten(1, 2)

        if self.training:
            additive_mask = self._convert_attention_mask(src_mask)
            src_mask = additive_mask.reshape([bs, 1, 1, h * w])
        else:
            src_mask = None

        memory = self.encoder(tokens, src_mask=src_mask, pos_embed=pos_embed)

        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        # src_mask is already None outside training
        if self.training:
            src_mask = src_mask.reshape([bs, 1, 1, h, w])

        memory_map = memory.transpose([0, 2, 1]).reshape([bs, c, h, w])
        return (output, memory_map, src_proj, src_mask)

View File

@@ -0,0 +1,527 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Modified from detrex (https://github.com/IDEA-Research/detrex)
# Copyright 2022 The IDEA Authors. All rights reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding
from .deformable_transformer import (MSDeformableAttention,
DeformableTransformerEncoderLayer,
DeformableTransformerEncoder)
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
bias_init_with_prob)
from .utils import (_get_clones, get_valid_ratio,
get_contrastive_denoising_training_group,
get_sine_pos_embed, inverse_sigmoid, MLP)
__all__ = ['DINOTransformer']
class DINOTransformerDecoderLayer(nn.Layer):
    """One DINO decoder layer.

    Consists of multi-head self attention over the queries, multi-scale
    deformable cross attention against the encoder memory, and a
    feed-forward block, each followed by a residual add and LayerNorm.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=1.0,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # cross attention
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-run the initializers on the two feed-forward linear layers."""
        ffn_layers = (self.linear1, self.linear2)
        # two passes keep the initializer calls in their original order
        for layer in ffn_layers:
            linear_init_(layer)
        for layer in ffn_layers:
            xavier_uniform_(layer.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Return ``linear2(dropout3(activation(linear1(tgt))))``."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run self attention, cross attention and the FFN on ``tgt``.

        Args:
            tgt: Query features.
            reference_points: Forwarded unchanged to ``self.cross_attn``.
            memory: Forwarded unchanged to ``self.cross_attn``.
            memory_spatial_shapes: Forwarded unchanged to ``self.cross_attn``.
            memory_level_start_index: Forwarded unchanged to
                ``self.cross_attn``.
            attn_mask: Optional self-attention mask. Entries that are true
                after ``astype('bool')`` become 0 and all others ``-inf``.
            memory_mask: Optional mask forwarded to ``self.cross_attn``.
            query_pos_embed: Optional positional term added to the queries.
        """
        # self attention
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            zeros = paddle.zeros(attn_mask.shape, tgt.dtype)
            neg_inf = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, zeros, neg_inf)
        attended = self.self_attn(query, key, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(attended))

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        attended = self.cross_attn(cross_query, reference_points, memory,
                                   memory_spatial_shapes,
                                   memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        return self.norm3(tgt + self.dropout4(ffn_out))
class DINOTransformerDecoder(nn.Layer):
    """Stack of DINO decoder layers with per-layer box refinement.

    After every layer the reference boxes are updated from that layer's
    output through the matching entry of ``bbox_head``.
    """

    def __init__(self,
                 hidden_dim,
                 decoder_layer,
                 num_layers,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(
            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                query_pos_head,
                valid_ratios=None,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` while refining the reference boxes layer by layer.

        Args:
            tgt: Initial query features.
            ref_points_unact: Reference boxes before the sigmoid.
            memory: Encoder output forwarded to every layer.
            memory_spatial_shapes: Forwarded to every layer.
            memory_level_start_index: Forwarded to every layer.
            bbox_head: Indexable collection of box heads, one per layer.
            query_pos_head: Callable mapping the sine embedding of the
                reference boxes to the query positional embedding.
            valid_ratios: Optional per-level ratios; defaults to all ones
                of shape ``[memory.shape[0], num_levels, 2]``.
            attn_mask: Optional self-attention mask for the layers.
            memory_mask: Optional mask for the cross attention.

        Returns:
            tuple: stacked normalized per-layer features and stacked
            per-layer refined boxes.
        """
        if valid_ratios is None:
            num_levels = memory_spatial_shapes.shape[0]
            valid_ratios = paddle.ones([memory.shape[0], num_levels, 2])

        hidden = tgt
        layer_feats = []
        layer_bboxes = []
        ref_points = F.sigmoid(ref_points_unact)
        for idx, layer in enumerate(self.layers):
            # gradients do not flow back through the previous boxes
            detached_ref = ref_points.detach()
            ratios = valid_ratios.tile([1, 1, 2]).unsqueeze(1)
            reference_points_input = detached_ref.unsqueeze(2) * ratios

            sine_embed = get_sine_pos_embed(
                reference_points_input[..., 0, :], self.hidden_dim // 2)
            query_pos_embed = query_pos_head(sine_embed)

            hidden = layer(hidden, reference_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # refine: predicted delta is added in inverse-sigmoid space
            box_delta = bbox_head[idx](hidden)
            ref_points = F.sigmoid(box_delta + inverse_sigmoid(detached_ref))

            layer_feats.append(self.norm(hidden))
            layer_bboxes.append(ref_points)

        return paddle.stack(layer_feats), paddle.stack(layer_bboxes)
@register
class DINOTransformer(nn.Layer):
    """Transformer part of a DINO-style detector.

    Backbone features are projected to ``hidden_dim`` channels and encoded by
    a deformable encoder. The top ``num_queries`` encoder positions (ranked
    by their best class score) provide the initial reference boxes for the
    decoder. In training mode, contrastive denoising queries are prepended
    to the decoder input.
    """
    # config keys shared with other registered modules
    __shared__ = ['num_classes', 'hidden_dim']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=900,
                 position_embed_type='sine',
                 in_feats_channel=[512, 1024, 2048],
                 num_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 lr_mult=1.0,
                 pe_temperature=10000,
                 pe_offset=-0.5,
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eps=1e-2):
        """Build projections, encoder, decoder, embeddings and heads.

        Args:
            num_classes (int): Output width of the score heads and number of
                rows of the denoising class embedding.
            hidden_dim (int): Channel width used throughout the transformer.
            num_queries (int): Number of encoder proposals kept by top-k.
            position_embed_type (str): 'sine' or 'learned'.
            in_feats_channel (list[int]): Channels of each backbone level;
                must not contain more entries than ``num_levels``.
            num_levels (int): Number of feature levels fed to the encoder.
            num_encoder_points (int): Passed to the encoder layer.
            num_decoder_points (int): Passed to the decoder layer.
            nhead (int): Number of attention heads.
            num_encoder_layers (int): Number of encoder layers.
            num_decoder_layers (int): Number of decoder layers and of
                decoder score/bbox heads.
            dim_feedforward (int): Hidden width of the feed-forward blocks.
            dropout (float): Dropout probability.
            activation (str): Activation name passed to the layers.
            lr_mult (float): Passed to the encoder and decoder layers.
            pe_temperature: Passed to ``PositionEmbedding``.
            pe_offset: Passed to ``PositionEmbedding``.
            num_denoising (int): Passed to the denoising group builder.
            label_noise_ratio (float): Passed to the denoising group builder.
            box_noise_scale (float): Passed to the denoising group builder.
            learnt_init_query (bool): When True the decoder queries come from
                a learned embedding, otherwise from the selected encoder
                features.
            eps (float): Margin used when deciding which anchors are valid.
        """
        super(DINOTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers

        # parameter attributes with a zero L2 regularizer, used for norm layers
        weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        # backbone feature projection
        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)

        # Transformer module
        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_encoder_points, lr_mult, weight_attr, bias_attr)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)
        decoder_layer = DINOTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points, lr_mult, weight_attr, bias_attr)
        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,
                                              num_decoder_layers, weight_attr,
                                              bias_attr)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # position embedding
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset)
        self.level_embed = nn.Embedding(num_levels, hidden_dim)
        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        # maps the sine embedding of the reference boxes to the query
        # positional embedding (used by the decoder)
        self.query_pos_head = MLP(2 * hidden_dim,
                                  hidden_dim,
                                  hidden_dim,
                                  num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
        # decoder head (one score head and one bbox head per decoder layer)
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize heads, embeddings and input projections."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        # last layer of every bbox MLP goes through constant_ with its
        # default value -- presumably zero; verify against ..initializer
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        normal_(self.level_embed.weight)
        # tgt_embed only exists when learnt_init_query is enabled
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        # l[0] is the convolution of each projection block
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)
            constant_(l[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_feats_channel`` from the ``channels`` of each input."""
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def _build_input_proj_layer(self,
                                in_feats_channel,
                                weight_attr=None,
                                bias_attr=None):
        """Create ``self.input_proj``: one conv + GroupNorm block per level.

        Backbone levels get a 1x1 convolution; the remaining
        ``num_levels - len(in_feats_channel)`` levels get a stride-2 3x3
        convolution.
        """
        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels, self.hidden_dim, kernel_size=1)), (
                            'norm', nn.GroupNorm(
                                32,
                                self.hidden_dim,
                                weight_attr=weight_attr,
                                bias_attr=bias_attr))))
        # the first extra level consumes the last backbone level
        in_channels = in_feats_channel[-1]
        for _ in range(self.num_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1)), ('norm', nn.GroupNorm(
                            32,
                            self.hidden_dim,
                            weight_attr=weight_attr,
                            bias_attr=bias_attr))))
            # later extra levels consume an already projected level
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats, pad_mask=None):
        """Project, flatten and concatenate all levels for the encoder.

        Args:
            feats: Sequence of backbone feature maps, one per level.
            pad_mask: Optional mask resized to every level with
                ``F.interpolate``; when None an all-ones mask is used.

        Returns:
            tuple: ``(feat_flatten, spatial_shapes, level_start_index,
            mask_flatten, lvl_pos_embed_flatten, valid_ratios)``.
            ``mask_flatten`` is None when ``pad_mask`` is None.
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                # first extra level comes from the raw last backbone map,
                # later ones from the previously produced level
                if i == len_srcs:
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for i, feat in enumerate(proj_feats):
            bs, _, h, w = paddle.shape(feat)
            spatial_shapes.append(paddle.stack([h, w]))
            # [b,c,h,w] -> [b,h*w,c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            if pad_mask is not None:
                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            # [b, h*w, c]
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            # add the level embedding so levels can be told apart
            lvl_pos_embed = pos_embed + self.level_embed.weight[i]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            if pad_mask is not None:
                # [b, h*w]
                mask_flatten.append(mask.flatten(1))

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # [b, l]
        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,
                                                                   1)
        # [b, l, c]
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [num_levels, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l] start index of each level
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, num_levels, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)
        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
                lvl_pos_embed_flatten, valid_ratios)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Run encoder, query selection and decoder.

        Args:
            feats: Sequence of backbone feature maps.
            pad_mask: Optional padding mask, see ``_get_encoder_input``.
            gt_meta: Ground-truth information; only read in training mode,
                where it is passed to
                ``get_contrastive_denoising_training_group``.

        Returns:
            tuple: ``(out_bboxes, out_logits, enc_topk_bboxes,
            enc_topk_logits, dn_meta)``. ``out_bboxes`` and ``out_logits``
            are stacked over decoder layers; ``dn_meta`` is None outside
            training.
        """
        # input projection and embedding
        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
         lvl_pos_embed_flatten,
         valid_ratios) = self._get_encoder_input(feats, pad_mask)

        # encoder
        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, mask_flatten, denoising_class,
            denoising_bbox_unact)

        # decoder
        inter_feats, inter_bboxes = self.decoder(
            target, init_ref_points_unact, memory, spatial_shapes,
            level_start_index, self.dec_bbox_head, self.query_pos_head,
            valid_ratios, attn_mask, mask_flatten)

        out_bboxes = []
        out_logits = []
        for i in range(self.num_decoder_layers):
            out_logits.append(self.dec_score_head[i](inter_feats[i]))
            # each layer predicts a delta relative to the boxes it started
            # from: the initial reference points for layer 0, the previous
            # layer's refined boxes otherwise
            if i == 0:
                out_bboxes.append(
                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
                              init_ref_points_unact))
            else:
                out_bboxes.append(
                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
                              inverse_sigmoid(inter_bboxes[i - 1])))

        out_bboxes = paddle.stack(out_bboxes)
        out_logits = paddle.stack(out_logits)
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _get_encoder_output_anchors(self,
                                    memory,
                                    spatial_shapes,
                                    memory_mask=None,
                                    grid_size=0.05):
        """Build one anchor per memory position and project the memory.

        Every position of every level gets an anchor made of its normalized
        grid-cell center and a width/height of ``grid_size * 2**lvl``. The
        anchors are returned in logit space (``log(a / (1 - a))``); anchors
        judged invalid are set to ``inf`` and the matching memory entries
        are zeroed before ``self.enc_output`` is applied.

        Returns:
            tuple: ``(output_memory, output_anchors)``.
        """
        output_anchors = []
        # running offset of the current level inside the flattened memory
        idx = 0
        for lvl, (h, w) in enumerate(spatial_shapes):
            if memory_mask is not None:
                # count mask entries along the first column / first row
                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])
                valid_H = paddle.sum(mask_[:, :, 0], 1)
                valid_W = paddle.sum(mask_[:, 0, :], 1)
            else:
                valid_H, valid_W = h, w

            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=h), paddle.arange(end=w))
            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)

            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
                [-1, 1, 1, 2]).astype(grid_xy.dtype)
            # cell centers normalized by the valid width/height
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            # anchor size doubles with every level
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            output_anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
            idx += h * w

        output_anchors = paddle.concat(output_anchors, 1)
        # an anchor is valid only if all four values lie in (eps, 1 - eps)
        valid_mask = ((output_anchors > self.eps) *
                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)
        # move the anchors to logit (inverse-sigmoid) space
        output_anchors = paddle.log(output_anchors / (1 - output_anchors))
        if memory_mask is not None:
            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0
        output_anchors = paddle.where(valid_mask, output_anchors,
                                      paddle.to_tensor(float("inf")))

        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)
        return output_memory, output_anchors

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           memory_mask=None,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top-k encoder proposals and assemble decoder inputs.

        Args:
            memory: Encoder output.
            spatial_shapes: Per-level (h, w) sizes.
            memory_mask: Optional flattened mask for the memory.
            denoising_class: Optional denoising query features, prepended
                to the decoder queries.
            denoising_bbox_unact: Optional denoising boxes (pre-sigmoid),
                prepended to the reference points.

        Returns:
            tuple: ``(target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits)``; the reference points are detached.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        output_memory, output_anchors = self._get_encoder_output_anchors(
            memory, spatial_shapes, memory_mask)
        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(
            output_memory) + output_anchors

        # rank positions by their highest class score
        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        # pair every selected index with its batch index for gather_nd
        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind).detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact.detach(
        ), enc_topk_bboxes, enc_topk_logits

View File

@@ -0,0 +1,85 @@
# Multi-scale deformable attention自定义OP编译
该自定义OP参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)文档实现。
## 1. 环境依赖
- Paddle >= 2.3.2
- gcc 8.2
## 2. 安装
请在当前路径下进行编译安装
```
cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```
编译完成后即可使用,以下为`ms_deformable_attn`的使用示例
```
# 引入paddle和自定义op
import paddle
from deformable_detr_ops import ms_deformable_attn
# 构造fake input tensor
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])
def get_test_tensors(channels):
value = paddle.rand(
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
sampling_locations = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points, 2],
dtype=paddle.float32)
attention_weights = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points],
dtype=paddle.float32) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-2, keepdim=True)
return [value, sampling_locations, attention_weights]
value, sampling_locations, attention_weights = get_test_tensors(c)
output = ms_deformable_attn(value,
spatial_shapes,
level_start_index,
sampling_locations,
attention_weights)
```
## 3. 单元测试
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示:
```
python test_ms_deformable_attn_op.py
```
运行成功后,打印如下:
```
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
*tensor1 True check_gradient_numerical(D=30)
*tensor2 True check_gradient_numerical(D=30)
*tensor3 True check_gradient_numerical(D=30)
*tensor1 True check_gradient_numerical(D=32)
*tensor2 True check_gradient_numerical(D=32)
*tensor3 True check_gradient_numerical(D=32)
*tensor1 True check_gradient_numerical(D=64)
*tensor2 True check_gradient_numerical(D=64)
*tensor3 True check_gradient_numerical(D=64)
*tensor1 True check_gradient_numerical(D=71)
*tensor2 True check_gradient_numerical(D=71)
*tensor3 True check_gradient_numerical(D=71)
*tensor1 True check_gradient_numerical(D=128)
*tensor2 True check_gradient_numerical(D=128)
*tensor3 True check_gradient_numerical(D=128)
*tensor1 True check_gradient_numerical(D=1024)
*tensor2 True check_gradient_numerical(D=1024)
*tensor3 True check_gradient_numerical(D=1024)
*tensor1 True check_gradient_numerical(D=1025)
*tensor2 True check_gradient_numerical(D=1025)
*tensor3 True check_gradient_numerical(D=1025)
*tensor1 True check_gradient_numerical(D=2048)
*tensor2 True check_gradient_numerical(D=2048)
*tensor3 True check_gradient_numerical(D=2048)
*tensor1 True check_gradient_numerical(D=3096)
*tensor2 True check_gradient_numerical(D=3096)
*tensor3 True check_gradient_numerical(D=3096)
```

View File

@@ -0,0 +1,65 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/extension.h"
#include <vector>
// Forward declarations of the GPU kernels that are bound to the op at the
// bottom of this file. Only the declarations live here; the definitions are
// presumably in the accompanying CUDA source (ms_deformable_attn_op.cu) --
// TODO confirm.
//
// Forward kernel: receives the five op inputs in registration order.
std::vector<paddle::Tensor>
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
const paddle::Tensor &value_spatial_shapes,
const paddle::Tensor &value_level_start_index,
const paddle::Tensor &sampling_locations,
const paddle::Tensor &attention_weights);
// Backward kernel: receives the same five inputs plus the gradient of the
// forward output (grad_out).
std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
const paddle::Tensor &value_level_start_index,
const paddle::Tensor &sampling_locations,
const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
// NOTE: no CPU kernel is implemented for this op.
// Shape inference for ms_deformable_attn.
//
// Produces a single 3-D output shape:
//   { value_shape[0], sampling_locations_shape[1],
//     value_shape[2] * value_shape[3] }
// The shapes of the spatial-shapes, level-start-index and attention-weights
// inputs are accepted (the op registration requires one shape per input) but
// are not consulted.
std::vector<std::vector<int64_t>>
MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
                           std::vector<int64_t> value_spatial_shapes_shape,
                           std::vector<int64_t> value_level_start_index_shape,
                           std::vector<int64_t> sampling_locations_shape,
                           std::vector<int64_t> attention_weights_shape) {
  const int64_t leading_dim = value_shape[0];
  const int64_t query_dim = sampling_locations_shape[1];
  // The last two value dimensions are merged into one output dimension.
  const int64_t merged_dim = value_shape[2] * value_shape[3];

  std::vector<int64_t> out_shape;
  out_shape.push_back(leading_dim);
  out_shape.push_back(query_dim);
  out_shape.push_back(merged_dim);
  return std::vector<std::vector<int64_t>>(1, out_shape);
}
// Dtype inference for ms_deformable_attn.
//
// The single output takes the dtype of the `value` input; the dtypes of the
// other four inputs are accepted but not consulted.
std::vector<paddle::DataType>
MSDeformableAttnInferDtype(paddle::DataType value_dtype,
                           paddle::DataType value_spatial_shapes_dtype,
                           paddle::DataType value_level_start_index_dtype,
                           paddle::DataType sampling_locations_dtype,
                           paddle::DataType attention_weights_dtype) {
  std::vector<paddle::DataType> out_dtypes;
  out_dtypes.push_back(value_dtype);
  return out_dtypes;
}
// Register the forward op `ms_deformable_attn`: five named inputs, one output
// ("Out"), the CUDA forward kernel, and the shape/dtype inference functions
// defined above.
PD_BUILD_OP(ms_deformable_attn)
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
"AttentionWeights"})
.Outputs({"Out"})
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
.SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
// Register the gradient op: it consumes the five forward inputs plus the
// gradient of "Out", and declares one gradient output per forward input
// (including the SpatialShapes and LevelIndex inputs).
PD_BUILD_GRAD_OP(ms_deformable_attn)
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
"AttentionWeights", paddle::Grad("Out")})
.Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
paddle::Grad("AttentionWeights")})
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
from paddle.utils.cpp_extension import CUDAExtension, setup
if __name__ == "__main__":
    # Build the `deformable_detr_ops` extension from the C++ op registration
    # file and the CUDA kernel file that sit next to this script.
    extension = CUDAExtension(
        sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])
    setup(name='deformable_detr_ops', ext_modules=extension)

View File

@@ -0,0 +1,140 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import os
import sys
import random
import numpy as np
import paddle
# Make the `ppdet` package importable when this script is run directly:
# resolve five parent-directory components starting from this file's path and
# append the result to sys.path (only once).
parent_path = os.path.abspath(os.path.join(__file__, *([os.pardir] * 5)))
if parent_path not in sys.path:
    sys.path.append(parent_path)
from ppdet.modeling.transformers.utils import deformable_attention_core_func

# Alias for the reference implementation that the custom op is compared with.
ms_deform_attn_core_paddle = deformable_attention_core_func
# The GPU index is taken from the first command-line argument. Fall back to
# GPU 0 when the argument is absent (IndexError) or is not an integer
# (ValueError). Only these two errors are caught so that KeyboardInterrupt,
# SystemExit and genuine bugs are no longer swallowed.
try:
    gpu_index = int(sys.argv[1])
except (IndexError, ValueError):
    gpu_index = 0
print(f'Use gpu {gpu_index} to test...')
paddle.set_device(f'gpu:{gpu_index}')

# The compiled custom op has to be importable; otherwise report the error and
# terminate the script with exit status -1.
try:
    from deformable_detr_ops import ms_deformable_attn
except Exception as e:
    print('import deformable_detr_ops error', e)
    sys.exit(-1)

# Seed every random number generator used here so runs are repeatable.
paddle.seed(1)
random.seed(1)
np.random.seed(1)

# Problem sizes shared by all checks below.
bs = 2
n_heads = 8
c = 8
query_length = 2
n_levels = 2
n_points = 2

# One (first, second) size pair per feature level; the pair's product is the
# number of value positions that level contributes.
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
# Start offset of each level: exclusive running sum of the per-level products.
level_start_index = paddle.concat(
    (paddle.to_tensor([0], dtype=paddle.int64),
     spatial_shapes.prod(1).cumsum(0)[:-1]))
# Total number of value positions over all levels.
value_length = sum((H * W).item() for H, W in spatial_shapes)
def get_test_tensors(channels):
    """Create random float32 inputs for the deformable-attention op.

    Args:
        channels (int): Size of the last dimension of the value tensor.

    Returns:
        list: ``[value, sampling_locations, attention_weights]`` where
            value has shape [bs, value_length, n_heads, channels] and is
            scaled by 0.01, sampling_locations has shape
            [bs, query_length, n_heads, n_levels, n_points, 2], and
            attention_weights has shape
            [bs, query_length, n_heads, n_levels, n_points] and is normalised
            to sum to one over its last two axes.
    """
    value_shape = [bs, value_length, n_heads, channels]
    location_shape = [bs, query_length, n_heads, n_levels, n_points, 2]
    weight_shape = [bs, query_length, n_heads, n_levels, n_points]

    value = paddle.rand(value_shape, dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(location_shape, dtype=paddle.float32)
    # The small offset keeps every weight strictly positive before normalising.
    attention_weights = paddle.rand(weight_shape, dtype=paddle.float32) + 1e-5
    attention_weights /= attention_weights.sum(
        -1, keepdim=True).sum(-2, keepdim=True)
    return [value, sampling_locations, attention_weights]
@paddle.no_grad()
def check_forward_equal_with_paddle_float():
    """Compare the custom op's forward output with the reference function.

    Both implementations are run on the same random inputs; the result of
    ``paddle.allclose`` (rtol=1e-2, atol=1e-3) and the maximum absolute and
    relative errors are printed. Nothing is returned or asserted.
    """
    value, sampling_locations, attention_weights = get_test_tensors(c)
    op_inputs = (value, spatial_shapes, level_start_index, sampling_locations,
                 attention_weights)

    output_paddle = ms_deform_attn_core_paddle(*op_inputs).detach().cpu()
    output_cuda = ms_deformable_attn(*op_inputs).detach().cpu()

    fwdok = paddle.allclose(
        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
    abs_diff = (output_cuda - output_paddle).abs()
    max_abs_err = abs_diff.max().item()
    # Relative error is measured against the reference output.
    max_rel_err = (abs_diff / output_paddle.abs()).max().item()
    print(
        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
    )
def check_gradient_numerical(channels=4):
    """Compare the gradients of the custom op with the reference function.

    The same random inputs are fed (as independent copies) to both
    implementations, ``sum()`` of each output is back-propagated, and for each
    of the three differentiable inputs the result of ``paddle.allclose``
    (rtol=1e-2, atol=1e-3) on the gradients is printed.

    Args:
        channels (int): Size of the last dimension of the value tensor.
    """

    def grads_close(reference, candidate):
        # True when both gradients agree within the test tolerances.
        return paddle.allclose(
            reference.grad, candidate.grad, rtol=1e-2, atol=1e-3).item()

    reference_inputs = get_test_tensors(channels)
    for tensor in reference_inputs:
        tensor.stop_gradient = False

    # Detached copies so that each implementation accumulates its own grads.
    cuda_inputs = [tensor.detach().clone() for tensor in reference_inputs]
    for tensor in cuda_inputs:
        tensor.stop_gradient = False

    value_paddle, sampling_locations_paddle, attention_weights_paddle = \
        reference_inputs
    value_cuda, sampling_locations_cuda, attention_weights_cuda = cuda_inputs

    output_paddle = ms_deform_attn_core_paddle(
        value_paddle, spatial_shapes, level_start_index,
        sampling_locations_paddle, attention_weights_paddle)
    output_paddle.sum().backward()

    output_cuda = ms_deformable_attn(
        value_cuda, spatial_shapes, level_start_index,
        sampling_locations_cuda, attention_weights_cuda)
    output_cuda.sum().backward()

    res = grads_close(value_paddle, value_cuda)
    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')
    res = grads_close(sampling_locations_paddle, sampling_locations_cuda)
    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')
    res = grads_close(attention_weights_paddle, attention_weights_cuda)
    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
    # Forward check first, then the gradient check for a range of channel
    # sizes.
    check_forward_equal_with_paddle_float()
    for channels in (30, 32, 64, 71, 128, 1024, 1025, 2048, 3096):
        check_gradient_numerical(channels)

View File

@@ -0,0 +1,287 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.ops import get_act_fn
from ..shape_spec import ShapeSpec
from ..backbones.csp_darknet import BaseConv
from ..backbones.cspresnet import RepVggBlock
from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
from ..initializer import xavier_uniform_, linear_init_
from ..layers import MultiHeadAttention
from paddle import ParamAttr
from paddle.regularizer import L2Decay
__all__ = ['HybridEncoder']
class CSPRepLayer(nn.Layer):
    """Two-branch block: a 1x1 conv followed by a stack of RepVggBlocks on one
    branch, a plain 1x1 conv on the other; the branches are summed and passed
    through an output projection.

    Args:
        in_channels (int): Channels of the input feature map.
        out_channels (int): Channels of the output feature map.
        num_blocks (int): Number of RepVggBlocks in the stacked branch.
        expansion (float): ``hidden_channels = int(out_channels * expansion)``.
        bias (bool): Forwarded to every BaseConv.
        act: Activation forwarded to BaseConv and RepVggBlock.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=3,
                 expansion=1.0,
                 bias=False,
                 act="silu"):
        super(CSPRepLayer, self).__init__()
        hidden_channels = int(out_channels * expansion)
        self.conv1 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
        rep_blocks = [
            RepVggBlock(hidden_channels, hidden_channels, act=act)
            for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*rep_blocks)
        # A projection is only needed when the hidden width differs from the
        # requested output width.
        if hidden_channels == out_channels:
            self.conv3 = nn.Identity()
        else:
            self.conv3 = BaseConv(
                hidden_channels,
                out_channels,
                ksize=1,
                stride=1,
                bias=bias,
                act=act)

    def forward(self, x):
        """Return ``conv3(bottlenecks(conv1(x)) + conv2(x))``."""
        stacked_branch = self.bottlenecks(self.conv1(x))
        plain_branch = self.conv2(x)
        return self.conv3(stacked_branch + plain_branch)
@register
class TransformerLayer(nn.Layer):
    """Transformer encoder layer: self-attention followed by a feed-forward
    network, each wrapped in a residual connection with LayerNorm applied
    either before (``normalize_before=True``) or after the sub-layer.

    Args:
        d_model (int): Feature width.
        nhead (int): Number of attention heads.
        dim_feedforward (int): Hidden width of the feed-forward network.
        dropout (float): Dropout rate on both residual branches.
        activation (str): Name of a function in ``paddle.nn.functional``.
        attn_dropout (float|None): Attention dropout; ``dropout`` if None.
        act_dropout (float|None): Dropout after the activation; ``dropout``
            if None.
        normalize_before (bool): Use pre-norm instead of post-norm.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerLayer, self).__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise the two feed-forward linear layers."""
        for linear in (self.linear1, self.linear2):
            linear_init_(linear)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Add ``pos_embed`` to ``tensor`` unless ``pos_embed`` is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply the layer to ``src``.

        Args:
            src (Tensor): Input sequence.
            src_mask (Tensor|None): Passed to attention as ``attn_mask``.
            pos_embed (Tensor|None): Added to queries and keys only.

        Returns:
            Tensor: The transformed sequence.
        """
        pre_norm = self.normalize_before

        # Self-attention sub-layer.
        shortcut = src
        if pre_norm:
            src = self.norm1(src)
        query = key = self.with_pos_embed(src, pos_embed)
        attn_out = self.self_attn(query, key, value=src, attn_mask=src_mask)
        src = shortcut + self.dropout1(attn_out)
        if not pre_norm:
            src = self.norm1(src)

        # Feed-forward sub-layer.
        shortcut = src
        if pre_norm:
            src = self.norm2(src)
        hidden = self.activation(self.linear1(src))
        ffn_out = self.linear2(self.dropout(hidden))
        src = shortcut + self.dropout2(ffn_out)
        if not pre_norm:
            src = self.norm2(src)
        return src
@register
@serializable
class HybridEncoder(nn.Layer):
    """Multi-level feature encoder.

    Every input level is projected to ``hidden_dim`` channels, the levels
    listed in ``use_encoder_idx`` are run through a transformer encoder, and
    the levels are then fused by a top-down pass followed by a bottom-up pass.

    Args:
        in_channels (list[int]): Channels of each input level.
        feat_strides (list[int]): Stride of each input level.
        hidden_dim (int): Channel width used throughout the encoder.
        use_encoder_idx (list[int]): Indices of the levels that are passed
            through a transformer encoder.
        num_encoder_layers (int): Layers per transformer encoder; the encoder
            step is skipped in ``forward`` when this is 0.
        encoder_layer: Layer handed to ``TransformerEncoder``.
        pe_temperature (float): Temperature of the sin-cos position embedding.
        expansion (float): Expansion ratio of the CSPRepLayer blocks.
        depth_mult (float): Each CSPRepLayer has ``round(3 * depth_mult)``
            blocks.
        act: Activation, resolved with ``get_act_fn`` when it is None, a str
            or a dict.
        trt (bool): Forwarded to ``get_act_fn``.
        eval_size (list|None): ``[height, width]`` used to pre-compute the
            position embeddings used outside training.
    """
    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
    __inject__ = ['encoder_layer']

    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 hidden_dim=256,
                 use_encoder_idx=[2],
                 num_encoder_layers=1,
                 encoder_layer='TransformerLayer',
                 pe_temperature=10000,
                 expansion=1.0,
                 depth_mult=1.0,
                 act='silu',
                 trt=False,
                 eval_size=None):
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.eval_size = eval_size

        # Per-level 1x1 conv + batch norm projecting to hidden_dim channels.
        self.input_proj = nn.LayerList()
        for level_channels in in_channels:
            projection = nn.Sequential(
                nn.Conv2D(
                    level_channels, hidden_dim, kernel_size=1,
                    bias_attr=False),
                nn.BatchNorm2D(
                    hidden_dim,
                    weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                    bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
            self.input_proj.append(projection)

        # One transformer encoder per encoded level.
        self.encoder = nn.LayerList([
            TransformerEncoder(encoder_layer, num_encoder_layers)
            for _ in use_encoder_idx
        ])

        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)
        blocks_per_stage = round(3 * depth_mult)
        num_fusions = len(in_channels) - 1

        # Top-down pass: one lateral conv and one fusion block per step.
        self.lateral_convs = nn.LayerList()
        self.fpn_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.lateral_convs.append(
                BaseConv(hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    blocks_per_stage,
                    act=act,
                    expansion=expansion))

        # Bottom-up pass: one stride-2 conv and one fusion block per step.
        self.downsample_convs = nn.LayerList()
        self.pan_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.downsample_convs.append(
                BaseConv(hidden_dim, hidden_dim, 3, stride=2, act=act))
            self.pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    blocks_per_stage,
                    act=act,
                    expansion=expansion))

        self._reset_parameters()

    def _reset_parameters(self):
        """Pre-compute ``pos_embed{idx}`` for each encoded level when an
        evaluation size is configured; otherwise do nothing."""
        if not self.eval_size:
            return
        eval_h, eval_w = self.eval_size[0], self.eval_size[1]
        for idx in self.use_encoder_idx:
            stride = self.feat_strides[idx]
            pos_embed = self.build_2d_sincos_position_embedding(
                eval_w // stride, eval_h // stride, self.hidden_dim,
                self.pe_temperature)
            setattr(self, f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(w,
                                           h,
                                           embed_dim=256,
                                           temperature=10000.):
        """Build a fixed 2-D sin-cos position embedding.

        Args:
            w (int): Grid width.
            h (int): Grid height.
            embed_dim (int): Embedding width; must be divisible by 4.
            temperature (float): Base of the frequency schedule.

        Returns:
            Tensor: Embedding of shape [1, w * h, embed_dim].
        """
        axis_w = paddle.arange(int(w), dtype=paddle.float32)
        axis_h = paddle.arange(int(h), dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(axis_w, axis_h)
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        # Outer product of flattened coordinates with the frequencies.
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        parts = [
            paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
            paddle.cos(out_h)
        ]
        return paddle.concat(parts, axis=1)[None, :, :]

    def forward(self, feats, for_mot=False):
        """Encode and fuse the multi-level features.

        Args:
            feats (list[Tensor]): One [B, C, H, W] feature map per level.
            for_mot (bool): Accepted but not used here.

        Returns:
            list[Tensor]: One fused feature map per input level.
        """
        assert len(feats) == len(self.in_channels)
        num_levels = len(self.in_channels)

        # Project every level to hidden_dim channels.
        proj_feats = [proj(feat) for proj, feat in zip(self.input_proj, feats)]

        # Transformer encoding of the selected levels.
        if self.num_encoder_layers > 0:
            for enc_pos, level in enumerate(self.use_encoder_idx):
                level_feat = proj_feats[level]
                h, w = level_feat.shape[2:]
                # [B, C, H, W] -> [B, HxW, C]
                src_flatten = level_feat.flatten(2).transpose([0, 2, 1])
                if self.training or self.eval_size is None:
                    pos_embed = self.build_2d_sincos_position_embedding(
                        w, h, self.hidden_dim, self.pe_temperature)
                else:
                    pos_embed = getattr(self, f'pos_embed{level}', None)
                memory = self.encoder[enc_pos](src_flatten,
                                               pos_embed=pos_embed)
                proj_feats[level] = memory.transpose([0, 2, 1]).reshape(
                    [-1, self.hidden_dim, h, w])

        # Top-down pass: start at the last level and merge into each earlier
        # one after 2x nearest upsampling.
        inner_outs = [proj_feats[-1]]
        for step, idx in enumerate(range(num_levels - 1, 0, -1)):
            high_feat = self.lateral_convs[step](inner_outs[0])
            inner_outs[0] = high_feat
            upsampled = F.interpolate(
                high_feat, scale_factor=2., mode="nearest")
            merged = self.fpn_blocks[step](paddle.concat(
                [upsampled, proj_feats[idx - 1]], axis=1))
            inner_outs.insert(0, merged)

        # Bottom-up pass: downsample the previous output and merge it with
        # the next top-down result.
        outs = [inner_outs[0]]
        for idx in range(num_levels - 1):
            downsampled = self.downsample_convs[idx](outs[-1])
            merged = self.pan_blocks[idx](paddle.concat(
                [downsampled, inner_outs[idx + 1]], axis=1))
            outs.append(merged)
        return outs

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``feat_strides`` from the input specs."""
        channels = [spec.channels for spec in input_shape]
        strides = [spec.stride for spec in input_shape]
        return {'in_channels': channels, 'feat_strides': strides}

    @property
    def out_shape(self):
        """One ShapeSpec per level: ``hidden_dim`` channels, input stride."""
        return [
            ShapeSpec(channels=self.hidden_dim, stride=self.feat_strides[idx])
            for idx in range(len(self.in_channels))
        ]

View File

@@ -0,0 +1,184 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from scipy.optimize import linear_sum_assignment
from ppdet.core.workspace import register, serializable
from ..losses.iou_loss import GIoULoss
from .utils import bbox_cxcywh_to_xyxy
__all__ = ['HungarianMatcher']
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Match predictions to ground-truth targets with the Hungarian algorithm.

    The assignment cost is a weighted sum of a classification cost, an L1 box
    cost and a GIoU cost, plus mask and dice costs when ``with_mask`` is set.
    """
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Use the sigmoid/focal-style classification
                cost instead of the softmax probability cost.
            with_mask (bool): Add mask and dice costs to the cost matrix.
            num_sample_points (int): Number of random points sampled from the
                masks when computing the mask costs.
            alpha (float): Focal cost balancing factor.
            gamma (float): Focal cost focusing exponent.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)

        Raises:
            ValueError: If ``with_mask`` is set but ``masks`` or ``gt_mask``
                is None.
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        # Nothing to match: return an empty index pair per image.
        if sum(num_gts) == 0:
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou

        # Compute the mask cost and dice cost
        if self.with_mask:
            # The previous `assert (cond, msg)` asserted a non-empty tuple and
            # therefore could never fail; validate the inputs explicitly.
            if masks is None or gt_mask is None:
                raise ValueError(
                    'Make sure the input has `mask` and `gt_mask`')
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            # Repeat each image's sample points once per target in that image.
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        # Split the batched cost matrix per image, then solve each image
        # against its own slice of the target columns.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]

View File

@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2-D position embedding, either fixed sinusoidal (``'sine'``) or a pair
    of learned row/column embedding tables (``'learned'``).

    Args:
        num_pos_feats (int): Embedding width per spatial axis.
        temperature (float): Base of the sinusoid frequency schedule.
        normalize (bool): Rescale cumulative positions to ``[0, scale]``.
        scale (float): Upper bound used when ``normalize`` is set.
        embed_type (str): ``'sine'`` or ``'learned'``.
        num_embeddings (int): Table size of the learned embeddings.
        offset (float): Added to the positions before normalisation.
        eps (float): Added to the normalisation denominator.
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=2 * math.pi,
                 embed_type='sine',
                 num_embeddings=50,
                 offset=0.,
                 eps=1e-6):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        self.offset = offset
        self.eps = eps
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            self.scale = scale
        elif self.embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"{self.embed_type} is not supported.")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W]
        Returns:
            pos (Tensor): [B, H, W, C]
        """
        if self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            col_index = paddle.arange(w)
            row_index = paddle.arange(h)
            x_emb = self.col_embed(col_index)
            y_emb = self.row_embed(row_index)
            # Broadcast the column table over rows and the row table over
            # columns, then join them along the feature axis.
            tiled_x = x_emb.unsqueeze(0).tile([h, 1, 1])
            tiled_y = y_emb.unsqueeze(1).tile([1, w, 1])
            return paddle.concat([tiled_x, tiled_y], axis=-1).unsqueeze(0)

        if self.embed_type != 'sine':
            raise ValueError(f"not supported {self.embed_type}")

        # Running sums of the mask along each spatial axis act as positions.
        y_embed = mask.cumsum(1)
        x_embed = mask.cumsum(2)
        if self.normalize:
            y_total = y_embed[:, -1:, :]
            x_total = x_embed[:, :, -1:]
            y_embed = (y_embed + self.offset) / (y_total +
                                                 self.eps) * self.scale
            x_embed = (x_embed + self.offset) / (x_total +
                                                 self.eps) * self.scale

        # Consecutive feature pairs share one frequency.
        dim_t = 2 * (paddle.arange(self.num_pos_feats) // 2).astype('float32')
        dim_t = self.temperature**(dim_t / self.num_pos_feats)

        pos_x = x_embed.unsqueeze(-1) / dim_t
        pos_y = y_embed.unsqueeze(-1) / dim_t
        # Interleave sin of the even features with cos of the odd features.
        pos_x = paddle.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
            axis=4).flatten(3)
        pos_y = paddle.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
            axis=4).flatten(3)
        return paddle.concat((pos_y, pos_x), axis=3)

View File

@@ -0,0 +1,523 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .deformable_transformer import MSDeformableAttention
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
bias_init_with_prob)
from .utils import (_get_clones, get_sine_pos_embed,
get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
__all__ = ['RTDETRTransformer']
class PPMSDeformableAttention(MSDeformableAttention):
    """MSDeformableAttention variant with its own ``forward``; the projection
    layers and the attention core come from the parent class."""

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        heads, levels, points = (self.num_heads, self.num_levels,
                                 self.num_points)

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the projected values at padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, heads, levels, points, 2])
        # Softmax runs jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, heads, levels * points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, heads, levels, points])

        ref_dim = reference_points.shape[-1]
        if ref_dim == 2:
            # Offsets are divided by the flipped per-level shape pair.
            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
            offset_normalizer = offset_normalizer.flip([1]).reshape(
                [1, 1, 1, levels, 1, 2])
            anchors = reference_points.reshape([bs, Len_q, 1, levels, 1, 2])
            sampling_locations = anchors + sampling_offsets / offset_normalizer
        elif ref_dim == 4:
            # Offsets are scaled by half of the last two reference components.
            sampling_locations = (
                reference_points[:, :, None, :, None, :2] + sampling_offsets /
                points * reference_points[:, :, None, :, None, 2:] * 0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        if isinstance(query, paddle.Tensor):
            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
            value_level_start_index = paddle.to_tensor(value_level_start_index)
            output = self.ms_deformable_attn_core(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)
        else:
            # Non-Tensor query: use the core function from the utils module.
            from ppdet.modeling.transformers.utils import deformable_attention_core_func
            output = deformable_attention_core_func(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)

        return self.output_proj(output)
class TransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention over the queries, deformable
    cross-attention into the memory, then a feed-forward network. Each
    sub-layer is followed by a residual add and LayerNorm.

    Args:
        d_model (int): Feature width.
        n_head (int): Number of attention heads.
        dim_feedforward (int): Hidden width of the feed-forward network.
        dropout (float): Dropout rate used throughout the layer.
        activation (str): Name of a function in ``paddle.nn.functional``.
        n_levels (int): Number of levels for the cross-attention.
        n_points (int): Sampling points per level for the cross-attention.
        weight_attr: Weight attribute of the two feed-forward linears.
        bias_attr: Bias attribute of the two feed-forward linears.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 weight_attr=None,
                 bias_attr=None):
        super(TransformerDecoderLayer, self).__init__()

        def no_decay_attr():
            # Fresh parameter attribute with zero L2 regularisation.
            return ParamAttr(regularizer=L2Decay(0.0))

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=no_decay_attr(), bias_attr=no_decay_attr())

        # cross attention
        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
                                                  n_points, 1.0)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=no_decay_attr(), bias_attr=no_decay_attr())

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
                                 bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
                                 bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=no_decay_attr(), bias_attr=no_decay_attr())
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the feed-forward linears, then re-initialise their
        weights with Xavier-uniform."""
        ffn_linears = (self.linear1, self.linear2)
        for linear in ffn_linears:
            linear_init_(linear)
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Add ``pos`` to ``tensor`` unless ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward network: linear, activation, dropout, linear."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run the three sub-layers on ``tgt`` and return the updated queries.

        A given ``attn_mask`` is converted to an additive mask first: entries
        that are true become 0 and all others become ``-inf``.
        """
        # self attention
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            allowed = paddle.zeros(attn_mask.shape, tgt.dtype)
            blocked = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, allowed, blocked)
        self_out = self.self_attn(query, key, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))
        return tgt
class TransformerDecoder(nn.Layer):
    """Stack of decoder layers with iterative refinement of the reference
    boxes after every layer.

    Args:
        hidden_dim (int): Feature width (stored only).
        decoder_layer (nn.Layer): Layer that is cloned ``num_layers`` times.
        num_layers (int): Number of decoder layers.
        eval_idx (int): Index of the layer whose output is returned outside
            training; negative values count from the end.
    """

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        if eval_idx < 0:
            eval_idx = num_layers + eval_idx
        self.eval_idx = eval_idx

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                score_head,
                query_pos_head,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` against ``memory``.

        In training mode the boxes and logits of every layer are collected;
        otherwise only those of layer ``eval_idx`` are collected and the loop
        stops there.

        Returns:
            tuple: (stacked boxes, stacked logits).
        """
        output = tgt
        dec_out_bboxes, dec_out_logits = [], []
        ref_points_detach = F.sigmoid(ref_points_unact)

        for i, layer in enumerate(self.layers):
            ref_points_input = ref_points_detach.unsqueeze(2)
            query_pos_embed = query_pos_head(ref_points_detach)

            output = layer(output, ref_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Refined boxes: predicted delta added in inverse-sigmoid space.
            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
                ref_points_detach))

            if self.training:
                dec_out_logits.append(score_head[i](output))
                if i == 0:
                    layer_bbox = inter_ref_bbox
                else:
                    # Uses the previous layer's non-detached boxes.
                    layer_bbox = F.sigmoid(bbox_head[i](output) +
                                           inverse_sigmoid(ref_points))
                dec_out_bboxes.append(layer_bbox)
            elif i == self.eval_idx:
                dec_out_logits.append(score_head[i](output))
                dec_out_bboxes.append(inter_ref_bbox)
                break

            ref_points = inter_ref_bbox
            if self.training:
                ref_points_detach = inter_ref_bbox.detach()
            else:
                ref_points_detach = inter_ref_bbox

        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)
@register
class RTDETRTransformer(nn.Layer):
    """Query selection, denoising setup and decoder for RT-DETR.

    The module projects backbone features to a common width, flattens
    them into one memory sequence, selects the top-scoring memory
    positions as decoder queries, and runs the `TransformerDecoder`.

    Args:
        num_classes (int): output width of every score head and number of
            rows in the denoising class embedding.
        hidden_dim (int): channel width of projections, embeddings, heads.
        num_queries (int): number of memory positions selected as queries.
        position_embed_type (str): must be 'sine' or 'learned'; it is only
            validated here.
        backbone_feat_channels (list[int]): input channels per feature
            level delivered by the backbone.
        feat_strides (list[int]): stride per feature level. When shorter
            than `num_levels` it is extended by doubling the last stride.
            The list passed in is copied and never modified.
        num_levels (int): number of feature levels fed to the decoder;
            levels beyond the backbone outputs come from stride-2 convs.
        num_decoder_points (int): forwarded to `TransformerDecoderLayer`.
        nhead (int): forwarded to `TransformerDecoderLayer`.
        num_decoder_layers (int): number of decoder stages and of
            per-stage score/box heads.
        dim_feedforward (int): forwarded to `TransformerDecoderLayer`.
        dropout (float): forwarded to `TransformerDecoderLayer`.
        activation (str): forwarded to `TransformerDecoderLayer`.
        num_denoising (int): forwarded to
            `get_contrastive_denoising_training_group` in training.
        label_noise_ratio (float): forwarded to the denoising helper.
        box_noise_scale (float): forwarded to the denoising helper.
        learnt_init_query (bool): if True the initial query content comes
            from a learned embedding table, otherwise from the selected
            encoder outputs.
        eval_size (list|None): ``[h, w]`` used to precompute anchors for
            evaluation; when falsy anchors are built on every forward.
        eval_idx (int): decoder stage returned in eval mode.
        eps (float): anchors are valid only if every coordinate lies
            strictly between `eps` and ``1 - eps``.
    """
    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=300,
                 position_embed_type='sine',
                 backbone_feat_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 num_levels=3,
                 num_decoder_points=4,
                 nhead=8,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eval_size=None,
                 eval_idx=-1,
                 eps=1e-2):
        super(RTDETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(backbone_feat_channels) <= num_levels
        assert len(feat_strides) == len(backbone_feat_channels)
        # Extend a private copy: appending to the argument itself would
        # grow the caller's list and the shared default list, so a second
        # construction with num_levels > len(feat_strides) would fail the
        # length assertion above.
        feat_strides = list(feat_strides)
        for _ in range(num_levels - len(feat_strides)):
            feat_strides.append(feat_strides[-1] * 2)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.feat_strides = feat_strides
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers
        self.eval_size = eval_size

        # backbone feature projection
        self._build_input_proj_layer(backbone_feat_channels)

        # Transformer module
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points)
        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
                                          num_decoder_layers, eval_idx)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head: one score head and one box head per decoder stage
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize heads, projections and (optionally) cached anchors."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)

        # init encoder output anchors and valid_mask
        if self.eval_size:
            self.anchors, self.valid_mask = self._generate_anchors()

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive `backbone_feat_channels` from the incoming shape specs."""
        return {'backbone_feat_channels': [i.channels for i in input_shape]}

    def _build_input_proj_layer(self, backbone_feat_channels):
        """Create one conv + batch-norm projection per feature level.

        Backbone levels get a 1x1 conv. Each extra level (up to
        ``self.num_levels``) gets a 3x3 stride-2 conv; the first extra
        level reads the last backbone channel count, later ones read
        ``self.hidden_dim``.
        """
        self.input_proj = nn.LayerList()
        for in_channels in backbone_feat_channels:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
        in_channels = backbone_feat_channels[-1]
        for _ in range(self.num_levels - len(backbone_feat_channels)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats):
        """Project, flatten and concatenate the multi-level features.

        Returns:
            Tuple ``(feat_flatten, spatial_shapes, level_start_index)``:
            the ``[b, l, c]`` memory tensor, the ``[h, w]`` of every
            level, and the start offset of every level inside ``l``.
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                if i == len_srcs:
                    # First extra level is computed from the raw last feature.
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        spatial_shapes = []
        level_start_index = [0, ]
        for feat in proj_feats:
            _, _, h, w = feat.shape
            # [b, c, h, w] -> [b, h*w, c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            # [num_levels, 2]
            spatial_shapes.append([h, w])
            # [l], start index of each level
            level_start_index.append(h * w + level_start_index[-1])

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # The last entry is the total length, not a level start.
        level_start_index.pop()
        return (feat_flatten, spatial_shapes, level_start_index)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Select queries from `feats` and decode them.

        Args:
            feats: list of backbone feature maps, one per level.
            pad_mask: accepted but not used by this method.
            gt_meta: ground-truth dict handed to the denoising helper;
                only read in training mode.

        Returns:
            Tuple ``(out_bboxes, out_logits, enc_topk_bboxes,
            enc_topk_logits, dn_meta)``; `dn_meta` is None in eval mode.
        """
        # input projection and embedding
        (memory, spatial_shapes,
         level_start_index) = self._get_encoder_input(feats)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, denoising_class, denoising_bbox_unact)

        # decoder
        out_bboxes, out_logits = self.decoder(
            target,
            init_ref_points_unact,
            memory,
            spatial_shapes,
            level_start_index,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask)
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _generate_anchors(self,
                          spatial_shapes=None,
                          grid_size=0.05,
                          dtype="float32"):
        """Build one anchor box per memory position, in logit space.

        Args:
            spatial_shapes: ``[h, w]`` per level; derived from
                ``self.eval_size`` and ``self.feat_strides`` when None.
            grid_size: base anchor width/height, doubled for each level.
            dtype: dtype of the generated grids.

        Returns:
            Tuple ``(anchors, valid_mask)``. `anchors` holds
            ``log(a / (1 - a))`` of the normalized boxes with invalid rows
            set to ``inf``; `valid_mask` marks rows whose four values all
            lie strictly between ``eps`` and ``1 - eps``.
        """
        if spatial_shapes is None:
            spatial_shapes = [
                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
                for s in self.feat_strides
            ]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(
                    end=h, dtype=dtype),
                paddle.arange(
                    end=w, dtype=dtype))
            grid_xy = paddle.stack([grid_x, grid_y], -1)

            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
            # Cell centers normalized by the level's width and height.
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

        anchors = paddle.concat(anchors, 1)
        valid_mask = ((anchors > self.eps) *
                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
        anchors = paddle.log(anchors / (1 - anchors))
        anchors = paddle.where(valid_mask, anchors,
                               paddle.to_tensor(float("inf")))
        return anchors, valid_mask

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Pick the top-scoring memory positions as decoder queries.

        Args:
            memory: flattened encoder features.
            spatial_shapes: ``[h, w]`` per level, used to build anchors.
            denoising_class: optional denoising query content that is
                prepended to the selected queries.
            denoising_bbox_unact: optional denoising boxes (before
                sigmoid) prepended to the selected reference boxes.

        Returns:
            Tuple ``(target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits)``.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        if self.training or self.eval_size is None:
            anchors, valid_mask = self._generate_anchors(spatial_shapes)
        else:
            anchors, valid_mask = self.anchors, self.valid_mask
        # Zero out memory at positions whose anchor is invalid.
        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)

        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

        # Rank positions by their best class score.
        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        if self.training:
            reference_points_unact = reference_points_unact.detach()
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind)
            if self.training:
                target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits

View File

@@ -0,0 +1,481 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified from detrex (https://github.com/IDEA-Research/detrex)
# Copyright 2022 The IDEA Authors. All rights reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
__all__ = [
'_get_clones', 'bbox_cxcywh_to_xyxy',
'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',
'deformable_attention_core_func', 'varifocal_loss_with_logits'
]
def bbox_area(boxes):
    """Return the per-row area of `boxes`.

    The area is ``(col2 - col0) * (col3 - col1)`` for every row, so the
    rows are read as corner-format boxes.
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights
def bbox_overlaps(boxes1, boxes2):
    """
    Compute the pairwise overlap ratio between two sets of boxes.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): intersection over union for every pair, with
            shape [M, N]; pairs without a positive intersection get 0.
    """
    num_first = boxes1.shape[0]
    num_second = boxes2.shape[0]
    if num_first * num_second == 0:
        return paddle.zeros([num_first, num_second], dtype='float32')

    area_first = bbox_area(boxes1)
    area_second = bbox_area(boxes2)

    # [M, 1, 4] against [N, 4] broadcasts to every pair.
    expanded = paddle.unsqueeze(boxes1, 1)
    inter_max = paddle.minimum(expanded[:, :, 2:], boxes2[:, 2:])
    inter_min = paddle.maximum(expanded[:, :, :2], boxes2[:, :2])
    inter_size = (inter_max - inter_min).clip(min=0)
    inter = inter_size.prod(axis=2)

    union = paddle.unsqueeze(area_first, 1) + area_second - inter
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
def _get_clones(module, N):
    """Return an ``nn.LayerList`` holding `N` deep copies of `module`."""
    copies = [copy.deepcopy(module) for _ in range(N)]
    return nn.LayerList(copies)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (center, size) to (min corner, max corner).

    The last axis of `x` is split into two halves: the first is read as
    the center, the second as the full size.
    """
    center, size = paddle.split(x, 2, axis=-1)
    half_size = 0.5 * size
    return paddle.concat([center - half_size, center + half_size], axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x1, y1, x2, y2) to (cx, cy, w, h).

    The last axis of `x` is split into four equal parts that are read as
    x1, y1, x2, y2 in that order.
    """
    left, top, right, bottom = x.split(4, axis=-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.concat([center_x, center_y, width, height], axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Compute the sigmoid focal loss.

    Args:
        logit: raw predictions.
        label: targets with the same shape as `logit`.
        normalizer: value the summed loss is divided by.
        alpha: weight for positive targets (``1 - alpha`` for negatives);
            a negative value disables this weighting.
        gamma: exponent of the ``(1 - p_t)`` modulating factor.

    Returns:
        The loss averaged over axis 1, summed, divided by `normalizer`.
    """
    prob = F.sigmoid(logit)
    bce = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    negative = 1 - label
    # Probability assigned to the true class of every element.
    p_t = prob * label + (1 - prob) * negative
    loss = bce * ((1 - p_t)**gamma)
    if alpha >= 0:
        balance = alpha * label + (1 - alpha) * negative
        loss = balance * loss
    return loss.mean(1).sum() / normalizer
def inverse_sigmoid(x, eps=1e-5):
    """Return the logit ``log(x / (1 - x))`` with clamping.

    `x` is first clipped to [0, 1]; numerator and denominator are then
    each clipped to at least `eps` so the logarithm stays finite.
    """
    clamped = x.clip(min=0., max=1.)
    numerator = clamped.clip(min=eps)
    denominator = (1 - clamped).clip(min=eps)
    return paddle.log(numerator / denominator)
def deformable_attention_core_func(value, value_spatial_shapes,
                                   value_level_start_index, sampling_locations,
                                   attention_weights):
    """
    Sample `value` at the given locations on every level and sum the
    samples weighted by `attention_weights`.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        value_level_start_index (Tensor|List): [n_levels]; not read by
            this function.
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C]
    """
    bs, _, n_head, c = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape

    # Cut the flattened value sequence back into one chunk per level.
    level_lengths = [h * w for h, w in value_spatial_shapes]
    level_values = value.split(level_lengths, axis=1)

    # Affine map 2 * loc - 1 (sends [0, 1] onto [-1, 1]).
    grids = 2 * sampling_locations - 1

    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        level_value = level_values[level].flatten(2)
        level_value = level_value.transpose([0, 2, 1])
        level_value = level_value.reshape([bs * n_head, c, h, w])
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        level_grid = grids[:, :, :, level].transpose([0, 2, 1, 3, 4])
        level_grid = level_grid.flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampled_per_level.append(
            F.grid_sample(
                level_value,
                level_grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))

    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [bs * n_head, 1, Len_q, n_levels * n_points])
    stacked = paddle.stack(sampled_per_level, axis=-2).flatten(-2)
    output = (stacked * weights).sum(-1).reshape([bs, n_head * c, Len_q])
    return output.transpose([0, 2, 1])
def get_valid_ratio(mask):
    """Return the (width, height) ratios of `mask`, one pair per sample.

    The height ratio sums the first column of each sample and divides by
    H; the width ratio sums the first row and divides by W. `mask` is
    unpacked as three axes ``(batch, H, W)``.
    """
    _, height, width = paddle.shape(mask)
    ratio_h = paddle.sum(mask[:, :, 0], 1) / height
    ratio_w = paddle.sum(mask[:, 0, :], 1) / width
    # [b, 2]
    return paddle.stack([ratio_w, ratio_h], -1)
def get_denoising_training_group(targets,
                                 num_classes,
                                 num_queries,
                                 class_embed,
                                 num_denoising=100,
                                 label_noise_ratio=0.5,
                                 box_noise_scale=1.0):
    """Build noised ground-truth queries for denoising training.

    Ground-truth labels and boxes are padded to the largest per-image
    count, repeated for several groups, randomly perturbed, and returned
    together with a self-attention mask that isolates the groups.

    Args:
        targets (dict): reads ``targets["gt_class"]`` and
            ``targets["gt_bbox"]``, one entry per image.
        num_classes (int): number of classes; also used as the padding
            label, which maps to an all-zero embedding row.
        num_queries (int): number of regular (matching) queries.
        class_embed (Tensor): class embedding table, one row per class.
        num_denoising (int): requested number of denoising queries; the
            effective number is ``max_gt_num * num_group``.
        label_noise_ratio (float): labels are replaced by a random class
            with probability ``label_noise_ratio * 0.5``.
        box_noise_scale (float): scale of the random box perturbation.

    Returns:
        Tuple ``(input_query_class, input_query_bbox, attn_mask, dn_meta)``
        where `input_query_bbox` is in logit space and `attn_mask` is True
        where attention is allowed. Four ``None`` values are returned when
        denoising is disabled or the batch holds no ground truth.
    """
    if num_denoising <= 0:
        return None, None, None, None
    num_gts = [len(t) for t in targets["gt_class"]]
    # default=0 lets an empty batch take the same early exit as a batch
    # without any ground truth instead of raising ValueError.
    max_gt_num = max(num_gts, default=0)
    if max_gt_num == 0:
        return None, None, None, None

    num_group = num_denoising // max_gt_num
    num_group = 1 if num_group == 0 else num_group
    # pad gt to max_num of a batch
    bs = len(targets["gt_class"])
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i in range(bs):
        num_gt = num_gts[i]
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1

    input_query_class = input_query_class.tile([1, num_group])
    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, num_group])

    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        # Explicit bool cast: combining the random bool mask with the
        # float padding mask would otherwise rely on implicit promotion.
        is_real_gt = pad_gt_mask.flatten().astype('bool')
        # half of bbox prob
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        chosen_idx = paddle.nonzero(paddle.logical_and(
            mask, is_real_gt)).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        # Centers move by up to half the box size, sizes by up to the
        # full box size, both scaled by box_noise_scale.
        diff = paddle.concat(
            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],
            axis=-1) * box_noise_scale
        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
        input_query_bbox += diff
    # Always convert to logit space, also when box noise is disabled, so
    # the returned boxes have the same meaning in both cases.
    input_query_bbox = inverse_sigmoid(input_query_bbox)

    # Extra all-zero row serves the padding label `num_classes`.
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # Start from all-False ("not blocked"); True marks blocked pairs.
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other: block every denoising column
    # that lies outside the row's own group.
    for i in range(num_group):
        start = max_gt_num * i
        end = max_gt_num * (i + 1)
        attn_mask[start:end, end:num_denoising] = True
        attn_mask[start:end, :start] = True
    # Flip so that True means "attention allowed".
    attn_mask = ~attn_mask

    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
def get_contrastive_denoising_training_group(targets,
                                             num_classes,
                                             num_queries,
                                             class_embed,
                                             num_denoising=100,
                                             label_noise_ratio=0.5,
                                             box_noise_scale=1.0):
    """Build positive and negative noised queries for denoising training.

    Every group holds the padded ground truth twice: the first copy is
    the positive half, the second copy the negative half, which receives
    a larger box perturbation.

    Args:
        targets (dict): reads ``targets["gt_class"]`` and
            ``targets["gt_bbox"]``, one entry per image.
        num_classes (int): number of classes; also used as the padding
            label, which maps to an all-zero embedding row.
        num_queries (int): number of regular (matching) queries.
        class_embed (Tensor): class embedding table, one row per class.
        num_denoising (int): requested number of denoising queries; the
            effective number is ``max_gt_num * 2 * num_group``.
        label_noise_ratio (float): labels are replaced by a random class
            with probability ``label_noise_ratio * 0.5``.
        box_noise_scale (float): scale of the random corner perturbation.

    Returns:
        Tuple ``(input_query_class, input_query_bbox, attn_mask, dn_meta)``
        where `input_query_bbox` is in logit space and `attn_mask` is True
        where attention is allowed. Four ``None`` values are returned when
        denoising is disabled or the batch holds no ground truth.
    """
    if num_denoising <= 0:
        return None, None, None, None
    num_gts = [len(t) for t in targets["gt_class"]]
    # default=0 lets an empty batch take the same early exit as a batch
    # without any ground truth instead of raising ValueError.
    max_gt_num = max(num_gts, default=0)
    if max_gt_num == 0:
        return None, None, None, None

    num_group = num_denoising // max_gt_num
    num_group = 1 if num_group == 0 else num_group
    # pad gt to max_num of a batch
    bs = len(targets["gt_class"])
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i in range(bs):
        num_gt = num_gts[i]
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1
    # each group has positive and negative queries.
    input_query_class = input_query_class.tile([1, 2 * num_group])
    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
    # positive and negative mask
    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])
    negative_gt_mask[:, max_gt_num:] = 1
    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
    positive_gt_mask = 1 - negative_gt_mask
    # contrastive denoising training positive index
    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * 2 * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        # Bool view of the padding mask so it can be combined with the
        # random bool mask without mixing dtypes.
        is_real_gt = pad_gt_mask.flatten().astype('bool')
        # half of bbox prob
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        chosen_idx = paddle.nonzero(paddle.logical_and(
            mask, is_real_gt)).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)

        # Per-corner noise budget: half the box width/height.
        diff = paddle.tile(input_query_bbox[..., 2:] * 0.5,
                           [1, 1, 2]) * box_noise_scale

        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
        rand_part = paddle.rand(input_query_bbox.shape)
        # Negative queries draw from [1, 2), positive ones from [0, 1).
        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
            1 - negative_gt_mask)
        rand_part *= rand_sign
        known_bbox += rand_part * diff
        known_bbox.clip_(min=0.0, max=1.0)
        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
    # Always convert to logit space, also when box noise is disabled:
    # the caller concatenates these boxes with unactivated reference
    # points and applies sigmoid afterwards.
    input_query_bbox = inverse_sigmoid(input_query_bbox)

    # Extra all-zero row serves the padding label `num_classes`.
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # Start from all-False ("not blocked"); True marks blocked pairs.
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other: block every denoising column
    # that lies outside the row's own group.
    for i in range(num_group):
        start = max_gt_num * 2 * i
        end = max_gt_num * 2 * (i + 1)
        attn_mask[start:end, end:num_denoising] = True
        attn_mask[start:end, :start] = True
    # Flip so that True means "attention allowed".
    attn_mask = ~attn_mask

    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
def get_sine_pos_embed(pos_tensor,
                       num_pos_feats=128,
                       temperature=10000,
                       exchange_xy=True):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (Tensor): positions; the last axis holds `n` scalar
            coordinates and the tensor is indexed with three axes.
        num_pos_feats (int): number of embedding features produced for
            each coordinate. Default: 128
        temperature (int): base of the frequency schedule. Default: 10000.
        exchange_xy (bool, optional): swap the embeddings of the first two
            coordinates, e.g. input `[x, y]` yields `[pos(y), pos(x)]`.
            Defaults: True.

    Returns:
        Tensor: position embedding whose last axis has
            `n * num_pos_feats` entries.
    """
    scale = 2. * math.pi
    # Feature pairs (2k, 2k + 1) share one frequency.
    pair_exponent = 2. * paddle.floor_divide(
        paddle.arange(num_pos_feats), paddle.to_tensor(2))
    frequencies = scale / temperature**(pair_exponent / num_pos_feats)

    def _encode(coord):
        # Even feature slots take the sine, odd slots the cosine.
        phase = coord * frequencies
        sin_part = phase[:, :, 0::2].sin()
        cos_part = phase[:, :, 1::2].cos()
        return paddle.stack((sin_part, cos_part), axis=3).flatten(2)

    num_coords = pos_tensor.shape[-1]
    embeddings = [_encode(coord) for coord in pos_tensor.split(num_coords, -1)]
    if exchange_xy:
        embeddings[0], embeddings[1] = embeddings[1], embeddings[0]
    return paddle.concat(embeddings, axis=2)
def mask_to_box_coordinate(mask,
                           normalize=False,
                           format="xyxy",
                           dtype="float32"):
    """
    Compute the bounding boxes around the provided mask.

    Args:
        mask (Tensor:bool): [b, c, h, w]
        normalize (bool): divide the corner coordinates by ``[w, h, w, h]``.
        format (str): "xyxy" returns corners; "xywh" passes the corners
            through `bbox_xyxy_to_cxcywh`.
        dtype (str): dtype of the coordinate grids and of the all-zero
            result.

    Returns:
        bbox (Tensor): [b, c, 4]; all zeros when the mask has no set
            element.
    """
    assert mask.ndim == 4
    assert format in ["xyxy", "xywh"]
    if mask.sum() == 0:
        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)

    h, w = mask.shape[-2:]
    y, x = paddle.meshgrid(
        paddle.arange(
            end=h, dtype=dtype), paddle.arange(
                end=w, dtype=dtype))

    def _extent(coord):
        # Max over masked coordinates (+1 for the exclusive upper edge);
        # unmasked cells are replaced by 1e8 before taking the min.
        masked = coord * mask
        upper = masked.flatten(-2).max(-1) + 1
        lower = paddle.where(mask, masked,
                             paddle.to_tensor(1e8)).flatten(-2).min(-1)
        return lower, upper

    x_min, x_max = _extent(x)
    y_min, y_max = _extent(y)
    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)
    if normalize:
        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)

    if format == "xyxy":
        return out_bbox
    return bbox_xyxy_to_cxcywh(out_bbox)
def varifocal_loss_with_logits(pred_logits,
                               gt_score,
                               label,
                               normalizer=1.0,
                               alpha=0.75,
                               gamma=2.0):
    """Compute the varifocal loss from raw logits.

    Args:
        pred_logits: raw class predictions.
        gt_score: soft target used in the binary cross entropy.
        label: indicator that selects which weight term applies.
        normalizer: value the summed loss is divided by.
        alpha: scale of the weight where `label` is 0.
        gamma: exponent applied to the predicted score in that weight.

    Returns:
        The weighted loss averaged over axis 1, summed, divided by
        `normalizer`.
    """
    pred_score = F.sigmoid(pred_logits)
    # Where label is 0: alpha * score^gamma. Where label is 1: gt_score.
    negative_weight = alpha * pred_score.pow(gamma) * (1 - label)
    positive_weight = gt_score * label
    weight = negative_weight + positive_weight
    loss = F.binary_cross_entropy_with_logits(
        pred_logits, gt_score, weight=weight, reduction='none')
    return loss.mean(1).sum() / normalizer
from ..initializer import linear_init_
class MLP(nn.Layer):
    """Multi-layer perceptron with ReLU between the linear layers.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Create `num_layers` linear layers.

        The widths run ``input_dim -> hidden_dim -> ... -> output_dim``
        with ``num_layers - 1`` hidden widths in between.
        """
        super().__init__()
        self.num_layers = num_layers
        widths = [input_dim] + [hidden_dim] * (num_layers - 1) + [output_dim]
        self.layers = nn.LayerList(
            [nn.Linear(n, k) for n, k in zip(widths[:-1], widths[1:])])
        self._reset_parameters()

    def _reset_parameters(self):
        """Run ``linear_init_`` on every linear layer."""
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        """Apply the layers in order; every layer but the last is followed by ReLU."""
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x