first commit
This commit is contained in:
27
rtdetr_paddle/ppdet/modeling/__init__.py
Normal file
27
rtdetr_paddle/ppdet/modeling/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings(
|
||||
action='ignore', category=DeprecationWarning, module='ops')
|
||||
|
||||
|
||||
from .ops import *
|
||||
from .backbones import *
|
||||
from .heads import *
|
||||
from .losses import *
|
||||
from .architectures import *
|
||||
from .post_process import *
|
||||
from .layers import *
|
||||
from .transformers import *
|
||||
16
rtdetr_paddle/ppdet/modeling/architectures/__init__.py
Normal file
16
rtdetr_paddle/ppdet/modeling/architectures/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .meta_arch import *
|
||||
from .detr import *
|
||||
116
rtdetr_paddle/ppdet/modeling/architectures/detr.py
Normal file
116
rtdetr_paddle/ppdet/modeling/architectures/detr.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
from .meta_arch import BaseArch
|
||||
from ppdet.core.workspace import register, create
|
||||
|
||||
__all__ = ['DETR']
|
||||
# Deformable DETR, DINO use the same architecture as DETR
|
||||
|
||||
|
||||
@register
class DETR(BaseArch):
    """DETR-style detector: backbone -> optional neck -> transformer -> head.

    Args:
        backbone: Module called with the input dict to produce features.
        transformer: Module called as ``transformer(feats, pad_mask, inputs)``.
        detr_head: Module producing losses (training) or predictions (eval).
        neck: Optional module applied to the backbone features.
        post_process: Module converting raw predictions to
            ``(bbox, bbox_num, mask)``.
        with_mask (bool): When True, the eval output also carries ``'mask'``.
        exclude_post_process (bool): When True, the head predictions are
            unpacked directly as ``(bbox, bbox_num, mask)`` and
            ``post_process`` is not called.
    """

    __category__ = 'architecture'
    __inject__ = ['post_process']
    __shared__ = ['with_mask', 'exclude_post_process']

    def __init__(self,
                 backbone,
                 transformer='DETRTransformer',
                 detr_head='DETRHead',
                 neck=None,
                 post_process='DETRPostProcess',
                 with_mask=False,
                 exclude_post_process=False):
        super(DETR, self).__init__()
        # Sub-modules, stored in pipeline order.
        self.backbone = backbone
        self.transformer = transformer
        self.detr_head = detr_head
        self.neck = neck
        self.post_process = post_process
        # Output-shaping flags.
        self.with_mask = with_mask
        self.exclude_post_process = exclude_post_process

    @classmethod
    def from_config(cls, cfg, *args, **kwargs):
        """Build the sub-modules from ``cfg`` and return them as a dict.

        The transformer receives the neck's output shape when a neck is
        configured and the backbone's output shape otherwise; the head always
        receives the backbone's output shape.
        """
        backbone = create(cfg['backbone'])

        # The neck (if any) sits between backbone and transformer.
        feat_shape = backbone.out_shape
        neck = None
        if cfg['neck']:
            neck = create(cfg['neck'], input_shape=feat_shape)
        if neck is not None:
            feat_shape = neck.out_shape

        transformer = create(cfg['transformer'], input_shape=feat_shape)

        detr_head = create(
            cfg['detr_head'],
            hidden_dim=transformer.hidden_dim,
            nhead=transformer.nhead,
            input_shape=backbone.out_shape)

        return {
            'backbone': backbone,
            'transformer': transformer,
            "detr_head": detr_head,
            "neck": neck
        }

    def _forward(self):
        """Run the full pipeline on ``self.inputs``.

        Returns:
            dict: In training mode, the head's loss dict with an added
            ``'loss'`` entry (sum of every entry whose key does not contain
            ``'log'``). In eval mode, ``{'bbox', 'bbox_num'}`` plus ``'mask'``
            when ``with_mask`` is set.
        """
        feats = self.backbone(self.inputs)
        if self.neck is not None:
            feats = self.neck(feats)

        mask_pad = self.inputs.get('pad_mask', None)
        trans_out = self.transformer(feats, mask_pad, self.inputs)

        if self.training:
            losses = self.detr_head(trans_out, feats, self.inputs)
            # Entries whose key contains 'log' are excluded from the total.
            total = paddle.add_n(
                [val for name, val in losses.items() if 'log' not in name])
            losses['loss'] = total
            return losses

        preds = self.detr_head(trans_out, feats)
        if self.exclude_post_process:
            bbox, bbox_num, mask = preds
        else:
            bbox, bbox_num, mask = self.post_process(
                preds, self.inputs['im_shape'], self.inputs['scale_factor'],
                paddle.shape(self.inputs['image'])[2:])

        result = {'bbox': bbox, 'bbox_num': bbox_num}
        if self.with_mask:
            result['mask'] = mask
        return result

    def get_loss(self):
        """Training entry point used by ``BaseArch.forward``."""
        return self._forward()

    def get_pred(self):
        """Inference entry point used by ``BaseArch.forward``."""
        return self._forward()
|
||||
132
rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
Normal file
132
rtdetr_paddle/ppdet/modeling/architectures/meta_arch.py
Normal file
@@ -0,0 +1,132 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import typing
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ppdet.modeling.post_process import nms
|
||||
|
||||
__all__ = ['BaseArch']
|
||||
|
||||
|
||||
@register
class BaseArch(nn.Layer):
    """Common base for detection architectures.

    Subclasses implement ``get_loss`` and ``get_pred``; ``forward`` stores the
    batch on ``self.inputs`` and dispatches to one of them depending on
    ``self.training``.

    Args:
        data_format (str): ``'NCHW'`` (default) or ``'NHWC'``. With
            ``'NHWC'`` the input image is transposed with permutation
            ``[0, 2, 3, 1]`` before use.
        use_extra_data (bool): Stored on the instance; not read in this class.
    """

    def __init__(self, data_format='NCHW', use_extra_data=False):
        super(BaseArch, self).__init__()
        self.data_format = data_format
        self.inputs = {}
        # When True, forward() applies ``image * self.scale + self.bias``.
        self.fuse_norm = False
        self.use_extra_data = use_extra_data

    def load_meanstd(self, cfg_transform):
        """Derive ``self.scale`` / ``self.bias`` from a transform config.

        The first item of ``cfg_transform`` that contains a
        ``'NormalizeImage'`` key supplies ``mean``, ``std`` and ``is_scale``
        (default True -> inputs are scaled by 1/255). When no such item
        exists the built-in mean/std defaults are used with a scale of 1.

        Args:
            cfg_transform: Iterable of mapping-like transform descriptions.
        """
        scale = 1.
        mean = np.array([0.485, 0.456, 0.406], dtype=np.float32)
        std = np.array([0.229, 0.224, 0.225], dtype=np.float32)

        for op in cfg_transform:
            if 'NormalizeImage' not in op:
                continue
            norm_cfg = op['NormalizeImage']
            mean = np.array(norm_cfg['mean'], dtype=np.float32)
            std = np.array(norm_cfg['std'], dtype=np.float32)
            if norm_cfg.get('is_scale', True):
                scale = 1. / 255.
            break  # only the first NormalizeImage entry is honoured

        # Place the three per-channel values on the channel axis of the layout.
        if self.data_format == 'NHWC':
            bcast_shape = (1, 1, 1, 3)
        else:
            bcast_shape = (1, 3, 1, 1)
        self.scale = paddle.to_tensor(scale / std).reshape(bcast_shape)
        self.bias = paddle.to_tensor(-mean / std).reshape(bcast_shape)

    def forward(self, inputs):
        """Bind ``inputs`` to the model and return losses or predictions.

        In training mode returns ``self.get_loss()``. Otherwise ``inputs`` is
        treated as one sample dict, or a sequence of them (multi-scale test);
        ``get_pred`` runs once per sample and the results are merged when
        there is more than one.
        """
        if self.data_format == 'NHWC':
            nchw_image = inputs['image']
            inputs['image'] = paddle.transpose(nchw_image, [0, 2, 3, 1])

        if self.fuse_norm:
            raw_image = inputs['image']
            self.inputs['image'] = raw_image * self.scale + self.bias
            self.inputs['im_shape'] = inputs['im_shape']
            self.inputs['scale_factor'] = inputs['scale_factor']
        else:
            self.inputs = inputs

        self.model_arch()

        if self.training:
            return self.get_loss()

        # multi-scale input: normalise to a list of per-scale sample dicts
        if isinstance(inputs, typing.Sequence):
            samples = list(inputs)
        else:
            samples = [inputs]

        preds = []
        for sample in samples:
            if self.fuse_norm:
                self.inputs['image'] = sample['image'] * self.scale + self.bias
                self.inputs['im_shape'] = sample['im_shape']
                self.inputs['scale_factor'] = sample['scale_factor']
            else:
                self.inputs = sample
            preds.append(self.get_pred())

        # multi-scale test
        if len(preds) > 1:
            return self.merge_multi_scale_predictions(preds)
        return preds[0]

    def merge_multi_scale_predictions(self, outs):
        """Merge per-scale predictions with class-wise NMS.

        Args:
            outs (list[dict]): Per-scale outputs, each with a ``'bbox'``
                tensor whose column 0 is compared against class ids and whose
                remaining columns are passed to ``nms``.

        Returns:
            dict: ``{'bbox', 'bbox_num'}`` holding at most ``keep_top_k`` rows
            (the rows with the largest column-1 value).

        Raises:
            Exception: If this class is not one of CascadeRCNN, FasterRCNN or
                MaskRCNN.
        """
        if self.__class__.__name__ not in ('CascadeRCNN', 'FasterRCNN',
                                           'MaskRCNN'):
            raise Exception(
                "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now"
            )

        num_classes = self.bbox_head.num_classes
        keep_top_k = self.bbox_post_process.nms.keep_top_k
        nms_threshold = self.bbox_post_process.nms.nms_threshold

        stacked = paddle.concat([o['bbox'] for o in outs]).numpy()
        per_class = []
        for cls_id in range(num_classes):
            selected = stacked[:, 0] == cls_id
            if not selected.any():
                continue
            kept = nms(stacked[selected, 1:], nms_threshold)
            # Re-attach the class id as the leading column.
            label_col = np.full((kept.shape[0], 1), cls_id)
            per_class.append(np.concatenate([label_col, kept], 1))

        merged = np.concatenate(per_class)
        # Ascending sort on column 1, then keep the trailing keep_top_k rows.
        ranked = sorted(merged, key=lambda row: row[1])
        merged = np.concatenate(ranked[-keep_top_k:]).reshape((-1, 6))

        return {
            'bbox': paddle.to_tensor(merged),
            'bbox_num': paddle.to_tensor(np.array([merged.shape[0], ]))
        }

    def build_inputs(self, data, input_def):
        """Map positional ``data`` entries to the names in ``input_def``."""
        return {key: data[pos] for pos, key in enumerate(input_def)}

    def model_arch(self, ):
        """Hook invoked by ``forward`` after inputs are bound; no-op here."""
        pass

    def get_loss(self, ):
        """Return training losses; must be overridden by subclasses."""
        raise NotImplementedError("Should implement get_loss method!")

    def get_pred(self, ):
        """Return inference predictions; must be overridden by subclasses."""
        raise NotImplementedError("Should implement get_pred method!")
|
||||
30
rtdetr_paddle/ppdet/modeling/backbones/__init__.py
Normal file
30
rtdetr_paddle/ppdet/modeling/backbones/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .resnet import *
|
||||
from .darknet import *
|
||||
from .mobilenet_v1 import *
|
||||
from .mobilenet_v3 import *
|
||||
from .shufflenet_v2 import *
|
||||
from .swin_transformer import *
|
||||
from .lcnet import *
|
||||
from .cspresnet import *
|
||||
from .csp_darknet import *
|
||||
from .convnext import *
|
||||
from .vision_transformer import *
|
||||
from .mobileone import *
|
||||
from .trans_encoder import *
|
||||
from .focalnet import *
|
||||
from .vit_mae import *
|
||||
from .hgnet_v2 import *
|
||||
245
rtdetr_paddle/ppdet/modeling/backbones/convnext.py
Normal file
245
rtdetr_paddle/ppdet/modeling/backbones/convnext.py
Normal file
@@ -0,0 +1,245 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
'''
|
||||
Modified from https://github.com/facebookresearch/ConvNeXt
|
||||
Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
All rights reserved.
|
||||
This source code is licensed under the license found in the
|
||||
LICENSE file in the root directory of this source tree.
|
||||
'''
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
from .transformer_utils import DropPath, trunc_normal_, zeros_
|
||||
|
||||
__all__ = ['ConvNeXt']
|
||||
|
||||
|
||||
class Block(nn.Layer):
    r""" ConvNeXt residual block.

    Pipeline: 7x7 depthwise conv -> permute to channels-last -> LayerNorm ->
    Linear (dim -> 4*dim) -> GELU -> Linear (4*dim -> dim) -> optional
    learnable per-channel scale -> permute back -> DropPath -> residual add.

    Args:
        dim (int): Number of input channels.
        drop_path (float): Stochastic depth rate. Default: 0.0
        layer_scale_init_value (float): Init value for Layer Scale; a value
            <= 0 disables the scale parameter. Default: 1e-6.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6):
        super().__init__()
        expanded = 4 * dim

        # depthwise conv: one 7x7 filter per channel
        self.dwconv = nn.Conv2D(
            dim, dim, kernel_size=7, padding=3, groups=dim)
        self.norm = LayerNorm(dim, eps=1e-6)
        # pointwise/1x1 convs, implemented with linear layers
        self.pwconv1 = nn.Linear(dim, expanded)
        self.act = nn.GELU()
        self.pwconv2 = nn.Linear(expanded, dim)

        if layer_scale_init_value > 0:
            scale_init = ParamAttr(
                initializer=Constant(layer_scale_init_value))
            self.gamma = self.create_parameter(shape=(dim, ), attr=scale_init)
        else:
            self.gamma = None

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

    def forward(self, x):
        """Apply the block to ``x`` and return ``x + drop_path(branch(x))``."""
        shortcut = x
        y = self.dwconv(x)
        # Move axis 1 to the end so LayerNorm/Linear act on the last axis.
        y = y.transpose([0, 2, 3, 1])
        y = self.pwconv1(self.norm(y))
        y = self.pwconv2(self.act(y))
        if self.gamma is not None:
            y = self.gamma * y
        # Restore the original axis order.
        y = y.transpose([0, 3, 1, 2])
        return shortcut + self.drop_path(y)
|
||||
|
||||
|
||||
class LayerNorm(nn.Layer):
    r""" LayerNorm over the channel axis for two layouts.

    ``channels_last`` (default) delegates to ``F.layer_norm`` on the last
    axis; ``channels_first`` normalizes over axis 1 by hand and applies the
    affine parameters broadcast over the two trailing axes.

    Args:
        normalized_shape (int): Number of channels being normalized.
        eps (float): Value added to the variance before the square root.
        data_format (str): ``"channels_last"`` or ``"channels_first"``.

    Raises:
        NotImplementedError: If ``data_format`` is neither supported value.
    """

    def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
        super().__init__()

        param_shape = (normalized_shape, )
        # Affine scale starts at 1, shift at 0.
        self.weight = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(1.)))
        self.bias = self.create_parameter(
            shape=param_shape, attr=ParamAttr(initializer=Constant(0.)))

        self.eps = eps
        self.data_format = data_format
        if self.data_format not in ["channels_last", "channels_first"]:
            raise NotImplementedError
        self.normalized_shape = param_shape

    def forward(self, x):
        """Return ``x`` normalized according to ``self.data_format``."""
        if self.data_format == "channels_last":
            return F.layer_norm(x, self.normalized_shape, self.weight,
                                self.bias, self.eps)

        if self.data_format == "channels_first":
            mean = x.mean(1, keepdim=True)
            centered = x - mean
            var = centered.pow(2).mean(1, keepdim=True)
            normed = centered / paddle.sqrt(var + self.eps)
            # Broadcast the per-channel affine terms over the trailing axes.
            scale = self.weight[:, None, None]
            shift = self.bias[:, None, None]
            return scale * normed + shift
|
||||
|
||||
|
||||
@register
@serializable
class ConvNeXt(nn.Layer):
    r""" ConvNeXt backbone.

    Paddle implementation of `A ConvNet for the 2020s` -
    https://arxiv.org/pdf/2201.03545.pdf

    Args:
        arch (str): Key into ``arch_settings`` selecting per-stage depths and
            channel widths. Default: 'tiny'
        in_chans (int): Number of input image channels. Default: 3
        drop_path_rate (float): Final stochastic depth rate; per-block rates
            are spaced linearly from 0 up to this value. Default: 0.
        layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6.
        return_idx (list[int]): Indices of the stages whose features are
            returned. Default: [1, 2, 3]
        norm_output (bool): Whether each returned feature passes through its
            own channels-first LayerNorm. Default: True
        pretrained (str|None): URL (any string containing 'http') or local
            path of a state dict loaded after initialization. Default: None
    """

    arch_settings = {
        'tiny': {
            'depths': [3, 3, 9, 3],
            'dims': [96, 192, 384, 768]
        },
        'small': {
            'depths': [3, 3, 27, 3],
            'dims': [96, 192, 384, 768]
        },
        'base': {
            'depths': [3, 3, 27, 3],
            'dims': [128, 256, 512, 1024]
        },
        'large': {
            'depths': [3, 3, 27, 3],
            'dims': [192, 384, 768, 1536]
        },
        'xlarge': {
            'depths': [3, 3, 27, 3],
            'dims': [256, 512, 1024, 2048]
        },
    }

    def __init__(
            self,
            arch='tiny',
            in_chans=3,
            drop_path_rate=0.,
            layer_scale_init_value=1e-6,
            return_idx=[1, 2, 3],
            norm_output=True,
            pretrained=None, ):
        super().__init__()
        setting = self.arch_settings[arch]
        depths = setting['depths']
        dims = setting['dims']

        # stem and 3 intermediate downsampling conv layers
        self.downsample_layers = nn.LayerList()
        patchify = nn.Sequential(
            nn.Conv2D(
                in_chans, dims[0], kernel_size=4, stride=4),
            LayerNorm(
                dims[0], eps=1e-6, data_format="channels_first"))
        self.downsample_layers.append(patchify)
        for c_in, c_out in zip(dims[:3], dims[1:4]):
            reduce = nn.Sequential(
                LayerNorm(
                    c_in, eps=1e-6, data_format="channels_first"),
                nn.Conv2D(
                    c_in, c_out, kernel_size=2, stride=2), )
            self.downsample_layers.append(reduce)

        # 4 feature resolution stages, each consisting of multiple residual blocks
        self.stages = nn.LayerList()
        dp_rates = list(np.linspace(0, drop_path_rate, sum(depths)))
        offset = 0
        for i in range(4):
            # Each stage consumes the next depths[i] drop-path rates.
            stage_rates = dp_rates[offset:offset + depths[i]]
            blocks = [
                Block(
                    dim=dims[i],
                    drop_path=rate,
                    layer_scale_init_value=layer_scale_init_value)
                for rate in stage_rates
            ]
            self.stages.append(nn.Sequential(*blocks))
            offset += depths[i]

        self.return_idx = return_idx
        self.dims = [dims[idx] for idx in return_idx]

        self.norm_output = norm_output
        if norm_output:
            out_norms = [
                LayerNorm(
                    ch, eps=1e-6, data_format="channels_first")
                for ch in self.dims
            ]
            self.norms = nn.LayerList(out_norms)

        self.apply(self._init_weights)

        if pretrained is not None:
            if 'http' in pretrained:
                # URL: resolve to a local weights path first
                weights_path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:
                # model in local path
                weights_path = pretrained
            self.set_state_dict(paddle.load(weights_path))

    def _init_weights(self, m):
        """Truncated-normal weights and zero bias for Conv2D/Linear layers."""
        if not isinstance(m, (nn.Conv2D, nn.Linear)):
            return
        trunc_normal_(m.weight)
        zeros_(m.bias)

    def forward_features(self, x):
        """Run all four stages and return the features at ``return_idx``."""
        stage_feats = []
        for i in range(4):
            x = self.stages[i](self.downsample_layers[i](x))
            stage_feats.append(x)

        picked = [stage_feats[idx] for idx in self.return_idx]
        if not self.norm_output:
            return picked

        # norms[k] belongs to the k-th returned feature, not to stage k.
        return [self.norms[k](feat) for k, feat in enumerate(picked)]

    def forward(self, x):
        """Extract features from the ``'image'`` entry of the input dict."""
        return self.forward_features(x['image'])

    @property
    def out_shape(self):
        """One ``ShapeSpec`` (channels only) per returned feature."""
        return [ShapeSpec(channels=ch) for ch in self.dims]
|
||||
404
rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
Normal file
404
rtdetr_paddle/ppdet/modeling/backbones/csp_darknet.py
Normal file
@@ -0,0 +1,404 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.initializer import conv_init_
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = [
|
||||
'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer'
|
||||
]
|
||||
|
||||
|
||||
class BaseConv(nn.Layer):
    """Conv2D -> BatchNorm2D -> ``x * sigmoid(x)``.

    Args:
        in_channels (int): Input channels of the convolution.
        out_channels (int): Output channels of the convolution.
        ksize (int): Kernel size; padding is ``(ksize - 1) // 2``.
        stride (int): Convolution stride.
        groups (int): Convolution groups. Default: 1
        bias (bool): Passed to the convolution as ``bias_attr``. Default: False
        act (str): Accepted for interface compatibility; ``forward`` always
            applies ``x * sigmoid(x)`` regardless of this value.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride,
                 groups=1,
                 bias=False,
                 act="silu"):
        super(BaseConv, self).__init__()
        pad = (ksize - 1) // 2
        self.conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=ksize,
            stride=stride,
            padding=pad,
            groups=groups,
            bias_attr=bias)
        # BatchNorm scale and shift use an L2Decay(0.0) regularizer.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        self._init_weights()

    def _init_weights(self):
        """Initialize the convolution with the project's ``conv_init_``."""
        conv_init_(self.conv)

    def forward(self, x):
        """Return ``bn(conv(x))`` gated by its own sigmoid."""
        # use 'x * F.sigmoid(x)' replace 'silu'
        normed = self.bn(self.conv(x))
        return normed * F.sigmoid(normed)
|
||||
|
||||
|
||||
class DWConv(nn.Layer):
    """Depthwise conv followed by a 1x1 pointwise conv (both ``BaseConv``).

    Args:
        in_channels (int): Input channels; also the group count of the
            depthwise stage.
        out_channels (int): Output channels of the pointwise stage.
        ksize (int): Kernel size of the depthwise stage.
        stride (int): Stride of the depthwise stage. Default: 1
        bias (bool): Forwarded to both ``BaseConv`` layers. Default: False
        act (str): Forwarded to both ``BaseConv`` layers. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(DWConv, self).__init__()
        # Depthwise stage: channel count unchanged, one group per channel.
        self.dw_conv = BaseConv(
            in_channels,
            in_channels,
            ksize=ksize,
            stride=stride,
            groups=in_channels,
            bias=bias,
            act=act)
        # Pointwise stage: 1x1 conv mixing channels.
        self.pw_conv = BaseConv(
            in_channels,
            out_channels,
            ksize=1,
            stride=1,
            groups=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Apply the depthwise stage, then the pointwise stage."""
        depthwise_out = self.dw_conv(x)
        return self.pw_conv(depthwise_out)
|
||||
|
||||
|
||||
class Focus(nn.Layer):
    """Focus width and height information into channel space, used in YOLOX.

    The input is sampled at every second position of its last two axes with
    the four possible (row, column) offsets; the four samples are
    concatenated on axis 1 and passed through a ``BaseConv``.

    Args:
        in_channels (int): Channels of the input; the conv sees 4x as many.
        out_channels (int): Output channels of the conv.
        ksize (int): Conv kernel size. Default: 3
        stride (int): Conv stride. Default: 1
        bias (bool): Forwarded to ``BaseConv``. Default: False
        act (str): Forwarded to ``BaseConv``. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=3,
                 stride=1,
                 bias=False,
                 act="silu"):
        super(Focus, self).__init__()
        stacked_channels = in_channels * 4
        self.conv = BaseConv(
            stacked_channels,
            out_channels,
            ksize=ksize,
            stride=stride,
            bias=bias,
            act=act)

    def forward(self, inputs):
        """Rearrange the input into channels and convolve it."""
        # inputs [bs, C, H, W] -> stacked [bs, 4C, H/2, W/2]
        even_row_even_col = inputs[:, :, 0::2, 0::2]
        even_row_odd_col = inputs[:, :, 0::2, 1::2]
        odd_row_even_col = inputs[:, :, 1::2, 0::2]
        odd_row_odd_col = inputs[:, :, 1::2, 1::2]
        # Channel order: top-left, bottom-left, top-right, bottom-right.
        stacked = paddle.concat(
            [
                even_row_even_col, odd_row_even_col, even_row_odd_col,
                odd_row_odd_col
            ],
            axis=1)
        return self.conv(stacked)
|
||||
|
||||
|
||||
class BottleNeck(nn.Layer):
    """1x1 conv followed by a 3x3 conv, with an optional residual add.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        shortcut (bool): Request a residual connection; it is only applied
            when ``in_channels == out_channels``. Default: True
        expansion (float): Hidden width is ``int(out_channels * expansion)``.
            Default: 0.5
        depthwise (bool): Use ``DWConv`` instead of ``BaseConv`` for the 3x3
            conv. Default: False
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(BottleNeck, self).__init__()
        mid_channels = int(out_channels * expansion)
        if depthwise:
            spatial_conv = DWConv
        else:
            spatial_conv = BaseConv

        self.conv1 = BaseConv(
            in_channels, mid_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = spatial_conv(
            mid_channels,
            out_channels,
            ksize=3,
            stride=1,
            bias=bias,
            act=act)
        # A residual add needs matching input/output channel counts.
        self.add_shortcut = shortcut and in_channels == out_channels

    def forward(self, x):
        """Return ``conv2(conv1(x))``, plus ``x`` when the shortcut is on."""
        out = self.conv2(self.conv1(x))
        if not self.add_shortcut:
            return out
        return out + x
|
||||
|
||||
|
||||
class SPPLayer(nn.Layer):
    """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX.

    A 1x1 conv halves the channels, the result is max-pooled once per kernel
    size (stride 1, padding ``ks // 2``), and the unpooled tensor plus all
    pooled tensors are concatenated on axis 1 and fused by a second 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        kernel_sizes (tuple[int]): Max-pool kernel sizes. Default: (5, 9, 13)
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_sizes=(5, 9, 13),
                 bias=False,
                 act="silu"):
        super(SPPLayer, self).__init__()
        mid_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, mid_channels, ksize=1, stride=1, bias=bias, act=act)

        pools = []
        for ks in kernel_sizes:
            pools.append(
                nn.MaxPool2D(
                    kernel_size=ks, stride=1, padding=ks // 2))
        self.maxpoolings = nn.LayerList(pools)

        # One branch per pooling kernel plus the unpooled branch.
        fused_channels = mid_channels * (len(kernel_sizes) + 1)
        self.conv2 = BaseConv(
            fused_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)

    def forward(self, x):
        """Pool at every kernel size, concatenate, and fuse."""
        reduced = self.conv1(x)
        branches = [reduced]
        branches.extend(pool(reduced) for pool in self.maxpoolings)
        return self.conv2(paddle.concat(branches, axis=1))
|
||||
|
||||
|
||||
class SPPFLayer(nn.Layer):
    """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher,
    equivalent to SPP(k=(5, 9, 13)) with the default ``ksize=5``.

    A single max-pool is applied three times in sequence; the input of the
    chain and the three successive outputs are concatenated on axis 1.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        ksize (int): Max-pool kernel size (stride 1, padding ``ksize // 2``).
            Default: 5
        bias (bool): Forwarded to both convs. Default: False
        act (str): Forwarded to both convs. Default: 'silu'
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 ksize=5,
                 bias=False,
                 act='silu'):
        super(SPPFLayer, self).__init__()
        mid_channels = in_channels // 2
        self.conv1 = BaseConv(
            in_channels, mid_channels, ksize=1, stride=1, bias=bias, act=act)
        self.maxpooling = nn.MaxPool2D(
            kernel_size=ksize, stride=1, padding=ksize // 2)
        # Unpooled tensor + three chained pooling results.
        fused_channels = mid_channels * 4
        self.conv2 = BaseConv(
            fused_channels, out_channels, ksize=1, stride=1, bias=bias, act=act)

    def forward(self, x):
        """Chain the pooling three times, concatenate, and fuse."""
        pyramid = [self.conv1(x)]
        for _ in range(3):
            # Each level pools the previous level's output.
            pyramid.append(self.maxpooling(pyramid[-1]))
        return self.conv2(paddle.concat(pyramid, axis=1))
|
||||
|
||||
|
||||
class CSPLayer(nn.Layer):
    """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5.

    The input feeds two 1x1 convs; the first branch additionally passes
    through ``num_blocks`` bottlenecks. Both branches are concatenated on
    axis 1 and fused by a third 1x1 conv.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        num_blocks (int): Number of ``BottleNeck`` blocks. Default: 1
        shortcut (bool): Forwarded to each ``BottleNeck``. Default: True
        expansion (float): Branch width is ``int(out_channels * expansion)``.
            Default: 0.5
        depthwise (bool): Forwarded to each ``BottleNeck``. Default: False
        bias (bool): Forwarded to every conv. Default: False
        act (str): Forwarded to every conv. Default: "silu"
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=1,
                 shortcut=True,
                 expansion=0.5,
                 depthwise=False,
                 bias=False,
                 act="silu"):
        super(CSPLayer, self).__init__()
        branch_channels = int(out_channels * expansion)

        self.conv1 = BaseConv(
            in_channels, branch_channels, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, branch_channels, ksize=1, stride=1, bias=bias, act=act)

        blocks = []
        for _ in range(num_blocks):
            # expansion=1.0 keeps the bottleneck's hidden width equal to the
            # branch width.
            blocks.append(
                BottleNeck(
                    branch_channels,
                    branch_channels,
                    shortcut=shortcut,
                    expansion=1.0,
                    depthwise=depthwise,
                    bias=bias,
                    act=act))
        self.bottlenecks = nn.Sequential(*blocks)

        self.conv3 = BaseConv(
            branch_channels * 2,
            out_channels,
            ksize=1,
            stride=1,
            bias=bias,
            act=act)

    def forward(self, x):
        """Run both branches on ``x``, concatenate, and fuse."""
        deep_branch = self.bottlenecks(self.conv1(x))
        skip_branch = self.conv2(x)
        merged = paddle.concat([deep_branch, skip_branch], axis=1)
        return self.conv3(merged)
|
||||
|
||||
|
||||
@register
|
||||
@serializable
|
||||
class CSPDarkNet(nn.Layer):
|
||||
"""
|
||||
CSPDarkNet backbone.
|
||||
Args:
|
||||
arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X,
|
||||
and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5.
|
||||
depth_mult (float): Depth multiplier, multiply number of channels in
|
||||
each layer, default as 1.0.
|
||||
width_mult (float): Width multiplier, multiply number of blocks in
|
||||
CSPLayer, default as 1.0.
|
||||
depthwise (bool): Whether to use depth-wise conv layer.
|
||||
act (str): Activation function type, default as 'silu'.
|
||||
return_idx (list): Index of stages whose feature maps are returned.
|
||||
"""
|
||||
|
||||
__shared__ = ['depth_mult', 'width_mult', 'act', 'trt']
|
||||
|
||||
# in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf)
|
||||
# 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5.
|
||||
arch_settings = {
|
||||
'X': [[64, 128, 3, True, False], [128, 256, 9, True, False],
|
||||
[256, 512, 9, True, False], [512, 1024, 3, False, True]],
|
||||
'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
||||
[256, 512, 9, True, False], [512, 1024, 3, True, True]],
|
||||
'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False],
|
||||
[256, 512, 9, True, False], [512, 768, 3, True, False],
|
||||
[768, 1024, 3, True, True]],
|
||||
}
|
||||
|
||||
def __init__(self,
|
||||
arch='X',
|
||||
depth_mult=1.0,
|
||||
width_mult=1.0,
|
||||
depthwise=False,
|
||||
act='silu',
|
||||
trt=False,
|
||||
return_idx=[2, 3, 4]):
|
||||
super(CSPDarkNet, self).__init__()
|
||||
self.arch = arch
|
||||
self.return_idx = return_idx
|
||||
Conv = DWConv if depthwise else BaseConv
|
||||
arch_setting = self.arch_settings[arch]
|
||||
base_channels = int(arch_setting[0][0] * width_mult)
|
||||
|
||||
# Note: differences between the latest YOLOv5 and the original YOLOX
|
||||
# 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX)
|
||||
# 2. use SPPF(in YOLOv5) or SPP(in YOLOX)
|
||||
# 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer
|
||||
# 4. whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX
|
||||
if arch in ['P5', 'P6']:
|
||||
# in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size)
|
||||
self.stem = Conv(
|
||||
3, base_channels, ksize=6, stride=2, bias=False, act=act)
|
||||
spp_kernal_sizes = 5
|
||||
elif arch in ['X']:
|
||||
# in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes)
|
||||
self.stem = Focus(
|
||||
3, base_channels, ksize=3, stride=1, bias=False, act=act)
|
||||
spp_kernal_sizes = (5, 9, 13)
|
||||
else:
|
||||
raise AttributeError("Unsupported arch type: {}".format(arch))
|
||||
|
||||
_out_channels = [base_channels]
|
||||
layers_num = 1
|
||||
self.csp_dark_blocks = []
|
||||
|
||||
for i, (in_channels, out_channels, num_blocks, shortcut,
|
||||
use_spp) in enumerate(arch_setting):
|
||||
in_channels = int(in_channels * width_mult)
|
||||
out_channels = int(out_channels * width_mult)
|
||||
_out_channels.append(out_channels)
|
||||
num_blocks = max(round(num_blocks * depth_mult), 1)
|
||||
stage = []
|
||||
|
||||
conv_layer = self.add_sublayer(
|
||||
'layers{}.stage{}.conv_layer'.format(layers_num, i + 1),
|
||||
Conv(
|
||||
in_channels, out_channels, 3, 2, bias=False, act=act))
|
||||
stage.append(conv_layer)
|
||||
layers_num += 1
|
||||
|
||||
if use_spp and arch in ['X']:
|
||||
# in YOLOX use SPPLayer
|
||||
spp_layer = self.add_sublayer(
|
||||
'layers{}.stage{}.spp_layer'.format(layers_num, i + 1),
|
||||
SPPLayer(
|
||||
out_channels,
|
||||
out_channels,
|
||||
kernel_sizes=spp_kernal_sizes,
|
||||
bias=False,
|
||||
act=act))
|
||||
stage.append(spp_layer)
|
||||
layers_num += 1
|
||||
|
||||
csp_layer = self.add_sublayer(
|
||||
'layers{}.stage{}.csp_layer'.format(layers_num, i + 1),
|
||||
CSPLayer(
|
||||
out_channels,
|
||||
out_channels,
|
||||
num_blocks=num_blocks,
|
||||
shortcut=shortcut,
|
||||
depthwise=depthwise,
|
||||
bias=False,
|
||||
act=act))
|
||||
stage.append(csp_layer)
|
||||
layers_num += 1
|
||||
|
||||
if use_spp and arch in ['P5', 'P6']:
|
||||
# in latest YOLOv5 use SPPFLayer instead of SPPLayer
|
||||
sppf_layer = self.add_sublayer(
|
||||
'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1),
|
||||
SPPFLayer(
|
||||
out_channels,
|
||||
out_channels,
|
||||
ksize=5,
|
||||
bias=False,
|
||||
act=act))
|
||||
stage.append(sppf_layer)
|
||||
layers_num += 1
|
||||
|
||||
self.csp_dark_blocks.append(nn.Sequential(*stage))
|
||||
|
||||
self._out_channels = [_out_channels[i] for i in self.return_idx]
|
||||
self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx]
|
||||
|
||||
def forward(self, inputs):
    """Run the stem and every stage, collecting the requested stage outputs.

    Args:
        inputs (dict): mapping that holds the input under the key 'image'.

    Returns:
        list: outputs of the stages whose 1-based position appears in
            ``self.return_idx``, in stage order.
    """
    feat = self.stem(inputs['image'])
    selected = []
    # Stages are counted from 1 because ``return_idx`` is compared to i + 1.
    for stage_no, stage in enumerate(self.csp_dark_blocks, start=1):
        feat = stage(feat)
        if stage_no in self.return_idx:
            selected.append(feat)
    return selected
|
||||
|
||||
@property
def out_shape(self):
    """Describe each returned feature map as a ``ShapeSpec``.

    Returns:
        list: one ``ShapeSpec(channels, stride)`` per pair taken from
            ``self._out_channels`` and ``self.strides``.
    """
    specs = []
    for channels, stride in zip(self._out_channels, self.strides):
        specs.append(ShapeSpec(channels=channels, stride=stride))
    return specs
|
||||
321
rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
Normal file
321
rtdetr_paddle/ppdet/modeling/backbones/cspresnet.py
Normal file
@@ -0,0 +1,321 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free 2D convolution followed by batch norm and an activation.

    Args:
        ch_in (int): number of input channels.
        ch_out (int): number of output channels.
        filter_size (int): convolution kernel size, default 3.
        stride (int): convolution stride, default 1.
        groups (int): number of convolution groups, default 1.
        padding (int): convolution padding, default 0.
        act: activation spec. ``None``, a ``str`` or a ``dict`` is handed to
            ``get_act_fn``; any other object is stored and called as-is.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 act=None):
        super(ConvBNLayer, self).__init__()

        # The convolution carries no bias; the batch norm that follows has one.
        self.conv = nn.Conv2D(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias_attr=False)

        # Both BN affine parameters are created with an L2Decay(0.0) regularizer.
        bn_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bn_bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        self.bn = nn.BatchNorm2D(
            ch_out, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Apply convolution, batch norm and activation, in that order."""
        return self.act(self.bn(self.conv(x)))
|
||||
|
||||
|
||||
class RepVggBlock(nn.Layer):
    """Two-branch (3x3 + 1x1) conv block that can be fused into one 3x3 conv.

    Before ``convert_to_deploy`` is called the output is
    ``act(conv1(x) + alpha * conv2(x))`` (``alpha`` omitted when disabled).
    Afterwards the two branches are replaced by a single ``self.conv`` whose
    weights are computed by ``get_equivalent_kernel_bias``.

    Args:
        ch_in (int): number of input channels.
        ch_out (int): number of output channels.
        act: activation spec. ``None``, a ``str`` or a ``dict`` is handed to
            ``get_act_fn``; any other object is stored and called as-is.
        alpha (bool): when true, create a learnable scalar (initialised to 1)
            that scales the 1x1 branch.
    """

    def __init__(self, ch_in, ch_out, act='relu', alpha=False):
        super(RepVggBlock, self).__init__()
        self.ch_in = ch_in
        self.ch_out = ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, 3, stride=1, padding=1, act=None)
        self.conv2 = ConvBNLayer(
            ch_in, ch_out, 1, stride=1, padding=0, act=None)
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act
        if alpha:
            self.alpha = self.create_parameter(
                shape=[1],
                attr=ParamAttr(initializer=Constant(value=1.)),
                dtype="float32")
        else:
            self.alpha = None

    def forward(self, x):
        """Run the fused conv if present, otherwise sum the two branches."""
        if hasattr(self, 'conv'):
            y = self.conv(x)
        elif self.alpha is not None:
            # Test for presence, not truthiness: ``alpha`` is a learnable
            # tensor and a value of exactly 0 must still take this branch.
            y = self.conv1(x) + self.alpha * self.conv2(x)
        else:
            y = self.conv1(x) + self.conv2(x)
        return self.act(y)

    def convert_to_deploy(self):
        """Fuse both branches into ``self.conv`` and drop the branch layers."""
        if not hasattr(self, 'conv'):
            self.conv = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=3,
                stride=1,
                padding=1,
                groups=1)
        kernel, bias = self.get_equivalent_kernel_bias()
        self.conv.weight.set_value(kernel)
        self.conv.bias.set_value(bias)
        # The branches are no longer needed once their weights are fused.
        self.__delattr__('conv1')
        self.__delattr__('conv2')

    def get_equivalent_kernel_bias(self):
        """Return the ``(kernel, bias)`` of the single 3x3 conv equal to both branches."""
        kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1)
        kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2)
        padded1x1 = self._pad_1x1_to_3x3_tensor(kernel1x1)
        if self.alpha is not None:
            # Same presence check as in ``forward`` so that the fused weights
            # always match the training-time computation.
            return (kernel3x3 + self.alpha * padded1x1,
                    bias3x3 + self.alpha * bias1x1)
        return kernel3x3 + padded1x1, bias3x3 + bias1x1

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Pad a 1x1 kernel by one element on each spatial side; ``None`` becomes 0."""
        if kernel1x1 is None:
            return 0
        return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's batch norm statistics into its conv kernel.

        Returns:
            tuple: ``(kernel, bias)`` of the equivalent conv, or ``(0, 0)``
                when ``branch`` is ``None``.
        """
        if branch is None:
            return 0, 0
        kernel = branch.conv.weight
        running_mean = branch.bn._mean
        running_var = branch.bn._variance
        gamma = branch.bn.weight
        beta = branch.bn.bias
        eps = branch.bn._epsilon
        std = (running_var + eps).sqrt()
        # Reshape the per-channel scale so it broadcasts over the kernel.
        t = (gamma / std).reshape((-1, 1, 1, 1))
        return kernel * t, beta - running_mean * gamma / std
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    """A 3x3 ``ConvBNLayer`` followed by a ``RepVggBlock``, with optional residual.

    Args:
        ch_in (int): number of input channels; must equal ``ch_out``.
        ch_out (int): number of output channels.
        act: activation spec forwarded to both sub-layers.
        shortcut (bool): add the block input to its output when true.
        use_alpha (bool): forwarded to ``RepVggBlock`` as ``alpha``.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 act='relu',
                 shortcut=True,
                 use_alpha=False):
        super(BasicBlock, self).__init__()
        assert ch_in == ch_out
        self.conv1 = ConvBNLayer(
            ch_in, ch_out, filter_size=3, stride=1, padding=1, act=act)
        self.conv2 = RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha)
        self.shortcut = shortcut

    def forward(self, x):
        """Apply both convolutions and, if enabled, the residual addition."""
        y = self.conv2(self.conv1(x))
        if not self.shortcut:
            return y
        return paddle.add(x, y)
|
||||
|
||||
|
||||
class EffectiveSELayer(nn.Layer):
    """Effective Squeeze-Excitation channel gate.

    From `CenterMask : Real-Time Anchor-Free Instance Segmentation`
    - https://arxiv.org/abs/1911.06667

    Args:
        channels (int): number of input (and output) channels.
        act: activation spec. ``None``, a ``str`` or a ``dict`` is handed to
            ``get_act_fn``; any other object is stored and called as-is.
    """

    def __init__(self, channels, act='hardsigmoid'):
        super(EffectiveSELayer, self).__init__()
        self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0)
        if act is None or isinstance(act, (str, dict)):
            self.act = get_act_fn(act)
        else:
            self.act = act

    def forward(self, x):
        """Scale ``x`` by a gate computed from its mean over axes 2 and 3."""
        pooled = x.mean((2, 3), keepdim=True)
        gate = self.act(self.fc(pooled))
        return x * gate
|
||||
|
||||
|
||||
class CSPResStage(nn.Layer):
    """One cross-stage-partial stage built from ``n`` residual blocks.

    The (optionally down-sampled) input goes through two parallel 1x1
    convolutions. One result passes through the block stack, then both are
    concatenated on axis 1, optionally gated, and projected by a final
    1x1 convolution.

    Args:
        block_fn (callable): block constructor, called as
            ``block_fn(ch, ch, act=..., shortcut=True, use_alpha=...)``.
        ch_in (int): number of input channels.
        ch_out (int): number of output channels.
        n (int): number of blocks in the stack.
        stride (int): a value of 2 inserts a stride-2 3x3 down-sampling
            convolution; any other value disables it.
        act: activation spec forwarded to every sub-layer.
        attn: any truthy value enables an ``EffectiveSELayer`` after the
            concatenation.
        use_alpha (bool): forwarded to ``block_fn``.
    """

    def __init__(self,
                 block_fn,
                 ch_in,
                 ch_out,
                 n,
                 stride,
                 act='relu',
                 attn='eca',
                 use_alpha=False):
        super(CSPResStage, self).__init__()

        ch_mid = (ch_in + ch_out) // 2
        half = ch_mid // 2

        self.conv_down = None
        if stride == 2:
            self.conv_down = ConvBNLayer(
                ch_in, ch_mid, 3, stride=2, padding=1, act=act)

        self.conv1 = ConvBNLayer(ch_mid, half, 1, act=act)
        self.conv2 = ConvBNLayer(ch_mid, half, 1, act=act)

        block_stack = []
        for _ in range(n):
            block_stack.append(
                block_fn(
                    half, half, act=act, shortcut=True, use_alpha=use_alpha))
        self.blocks = nn.Sequential(*block_stack)

        self.attn = EffectiveSELayer(
            ch_mid, act='hardsigmoid') if attn else None

        self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act)

    def forward(self, x):
        """Run the stage on ``x`` and return the projected feature map."""
        if self.conv_down is not None:
            x = self.conv_down(x)
        bypass = self.conv1(x)
        processed = self.blocks(self.conv2(x))
        merged = paddle.concat([bypass, processed], axis=1)
        if self.attn is not None:
            merged = self.attn(merged)
        return self.conv3(merged)
|
||||
|
||||
|
||||
@register
@serializable
class CSPResNet(nn.Layer):
    """CSPResNet backbone: a convolutional stem followed by ``CSPResStage`` stages.

    Args:
        layers (list[int]): number of blocks per stage, before scaling by
            ``depth_mult``.
        channels (list[int]): stem output width followed by the output width
            of every stage, before scaling by ``width_mult``.
        act: activation spec. ``None``, a ``str`` or a ``dict`` is handed to
            ``get_act_fn`` together with ``trt``; any other object is used
            as-is.
        return_idx (list[int]): 0-based indices of the stages whose outputs
            ``forward`` returns.
        depth_wise (bool): accepted but not used by this class.
        use_large_stem (bool): use a three-convolution stem instead of two.
        width_mult (float): channel multiplier.
        depth_mult (float): multiplier for the per-stage block count.
        trt (bool): forwarded to ``get_act_fn``.
        use_checkpoint (bool): during training, run every stage through
            ``paddle.distributed.fleet.utils.recompute``.
        use_alpha (bool): forwarded to every ``CSPResStage``.
        **args: accepted and ignored.
    """

    __shared__ = ['width_mult', 'depth_mult', 'trt']

    def __init__(self,
                 layers=[3, 6, 6, 3],
                 channels=[64, 128, 256, 512, 1024],
                 act='swish',
                 return_idx=[1, 2, 3],
                 depth_wise=False,
                 use_large_stem=False,
                 width_mult=1.0,
                 depth_mult=1.0,
                 trt=False,
                 use_checkpoint=False,
                 use_alpha=False,
                 **args):
        super(CSPResNet, self).__init__()
        self.use_checkpoint = use_checkpoint
        # Scale widths and depths, never letting either drop below 1.
        channels = [max(round(c * width_mult), 1) for c in channels]
        layers = [max(round(l * depth_mult), 1) for l in layers]
        act = get_act_fn(
            act, trt=trt) if act is None or isinstance(act,
                                                       (str, dict)) else act

        stem_mid = channels[0] // 2
        stem_layers = [('conv1', ConvBNLayer(
            3, stem_mid, 3, stride=2, padding=1, act=act))]
        if use_large_stem:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_mid, stem_mid, 3, stride=1, padding=1, act=act)))
            stem_layers.append(('conv3', ConvBNLayer(
                stem_mid, channels[0], 3, stride=1, padding=1, act=act)))
        else:
            stem_layers.append(('conv2', ConvBNLayer(
                stem_mid, channels[0], 3, stride=1, padding=1, act=act)))
        self.stem = nn.Sequential(*stem_layers)

        n = len(channels) - 1
        self.stages = nn.Sequential(*[(str(i), CSPResStage(
            BasicBlock,
            channels[i],
            channels[i + 1],
            layers[i],
            2,
            act=act,
            use_alpha=use_alpha)) for i in range(n)])

        self._out_channels = channels[1:]
        # The stem has stride 2 and every stage halves the resolution again.
        self._out_strides = [4 * 2**i for i in range(n)]
        # Copy so that the shared default list is never aliased by an instance.
        self.return_idx = list(return_idx)
        if use_checkpoint:
            # NOTE(review): resets the global paddle seed; presumably needed
            # for reproducible recompute — confirm.
            paddle.seed(0)

    def forward(self, inputs):
        """Return the outputs of the stages selected by ``return_idx``.

        Args:
            inputs (dict): mapping that holds the input under the key 'image'.

        Returns:
            list: selected stage outputs, in stage order.
        """
        x = self.stem(inputs['image'])
        outs = []
        for idx, stage in enumerate(self.stages):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    stage, x, preserve_rng_state=True)
            else:
                x = stage(x)
            if idx in self.return_idx:
                outs.append(x)

        return outs

    @property
    def out_shape(self):
        """List one ``ShapeSpec(channels, stride)`` per entry of ``return_idx``."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]
|
||||
345
rtdetr_paddle/ppdet/modeling/backbones/darknet.py
Executable file
345
rtdetr_paddle/ppdet/modeling/backbones/darknet.py
Executable file
@@ -0,0 +1,345 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.ops import batch_norm, mish
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['DarkNet', 'ConvBNLayer']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=1,
                 groups=1,
                 padding=0,
                 norm_type='bn',
                 norm_decay=0.,
                 act="leaky",
                 freeze_norm=False,
                 data_format='NCHW',
                 name=''):
        """
        conv + bn + activation layer

        Args:
            ch_in (int): input channel
            ch_out (int): output channel
            filter_size (int): filter size, default 3
            stride (int): stride, default 1
            groups (int): number of groups of conv layer, default 1
            padding (int): padding size, default 0
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            act (str|None): activation function type, default 'leaky', which
                means leaky_relu with slope 0.1. Any other string is looked
                up by name in ``paddle.nn.functional``; ``None`` disables
                the activation.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
            name (str): accepted but not used by this layer
        """
        super(ConvBNLayer, self).__init__()

        self.conv = nn.Conv2D(
            in_channels=ch_in,
            out_channels=ch_out,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=groups,
            data_format=data_format,
            bias_attr=False)
        self.batch_norm = batch_norm(
            ch_out,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.act = act

    def forward(self, inputs):
        """Apply convolution, normalisation and the configured activation."""
        out = self.batch_norm(self.conv(inputs))
        if self.act is None:
            # No activation requested; getattr(F, None) would raise TypeError.
            return out
        if self.act == 'leaky':
            return F.leaky_relu(out, 0.1)
        return getattr(F, self.act)(out)
|
||||
|
||||
|
||||
class DownSample(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size=3,
                 stride=2,
                 padding=1,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Down-sampling layer: a single ConvBNLayer, stride 2 by default.

        Args:
            ch_in (int): number of input channels
            ch_out (int): number of output channels
            filter_size (int): convolution kernel size, default 3
            stride (int): convolution stride, default 2
            padding (int): convolution padding, default 1
            norm_type (str): normalisation type, default bn
            norm_decay (float): decay for the norm layer's weight and bias, default 0.
            freeze_norm (bool): whether to freeze the norm layer, default False
            data_format (str): data layout, NCHW or NHWC
        """
        super(DownSample, self).__init__()

        conv_kwargs = dict(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv_bn_layer = ConvBNLayer(**conv_kwargs)
        self.ch_out = ch_out

    def forward(self, inputs):
        """Return the output of the wrapped ConvBNLayer."""
        return self.conv_bn_layer(inputs)
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Residual block of DarkNet: a 1x1 conv that halves the channels, a
        3x3 conv that restores them, and a skip connection.

        Args:
            ch_in (int): number of input channels; must be even and equal ch_out
            ch_out (int): number of output channels
            norm_type (str): normalisation type, default bn
            norm_decay (float): decay for the norm layer's weight and bias, default 0.
            freeze_norm (bool): whether to freeze the norm layer, default False
            data_format (str): data layout, NCHW or NHWC
        """
        super(BasicBlock, self).__init__()

        assert ch_in == ch_out and (ch_in % 2) == 0, \
            f"ch_in and ch_out should be the same even int, but the input \'ch_in is {ch_in}, \'ch_out is {ch_out}"

        # Channel route, e.g. for 10 channels: conv1 10 -> 5, conv2 5 -> 10.
        half = int(ch_out / 2)
        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)
        self.conv1 = ConvBNLayer(
            ch_in=ch_in,
            ch_out=half,
            filter_size=1,
            stride=1,
            padding=0,
            **norm_kwargs)
        self.conv2 = ConvBNLayer(
            ch_in=half,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            padding=1,
            **norm_kwargs)

    def forward(self, inputs):
        """Return ``inputs`` plus the output of the two stacked convolutions."""
        residual = self.conv2(self.conv1(inputs))
        return paddle.add(x=inputs, y=residual)
|
||||
|
||||
|
||||
class Blocks(nn.Layer):
    def __init__(self,
                 ch_in,
                 ch_out,
                 count,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None,
                 data_format='NCHW'):
        """
        A chain of ``count`` BasicBlock layers applied one after another.

        Args:
            ch_in (int): number of input channels
            ch_out (int): number of output channels
            count (int): number of BasicBlock layers
            norm_type (str): normalisation type, default bn
            norm_decay (float): decay for the norm layer's weight and bias, default 0.
            freeze_norm (bool): whether to freeze the norm layer, default False
            name (str): prefix for the sublayer names of blocks 1..count-1
            data_format (str): data layout, NCHW or NHWC
        """
        super(Blocks, self).__init__()

        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        # The first block is a plain attribute; the rest are registered by name.
        self.basicblock0 = BasicBlock(ch_in, ch_out, **norm_kwargs)
        self.res_out_list = []
        for idx in range(1, count):
            block_name = '{}.{}'.format(name, idx)
            extra_block = self.add_sublayer(
                block_name, BasicBlock(ch_out, ch_out, **norm_kwargs))
            self.res_out_list.append(extra_block)
        self.ch_out = ch_out

    def forward(self, inputs):
        """Pass ``inputs`` through every block in order."""
        out = self.basicblock0(inputs)
        for block in self.res_out_list:
            out = block(out)
        return out
|
||||
|
||||
|
||||
# Maps a network depth to its per-stage block counts.
DarkNet_cfg = {
    53: [1, 2, 8, 8, 4],
}
|
||||
|
||||
|
||||
@register
@serializable
class DarkNet(nn.Layer):
    __shared__ = ['norm_type', 'data_format']

    def __init__(self,
                 depth=53,
                 freeze_at=-1,
                 return_idx=[2, 3, 4],
                 num_stages=5,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 data_format='NCHW'):
        """
        Darknet, see https://pjreddie.com/darknet/yolo/

        Args:
            depth (int): depth of network, used as key into DarkNet_cfg
            freeze_at (int): index of the stage whose output gets
                stop_gradient set; the default -1 matches no stage
            return_idx (list): index of stages whose feature maps are returned
            num_stages (int): number of stages to build, default 5
            norm_type (str): batch norm type, default bn
            norm_decay (float): decay for weight and bias of batch norm layer, default 0.
            freeze_norm (bool): whether to freeze norm, default False
            data_format (str): data format, NCHW or NHWC
        """
        super(DarkNet, self).__init__()
        self.depth = depth
        self.freeze_at = freeze_at
        # Copy so that the shared default list is never aliased by an instance.
        self.return_idx = list(return_idx)
        self.num_stages = num_stages
        self.stages = DarkNet_cfg[self.depth][0:num_stages]

        norm_kwargs = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            data_format=data_format)

        self.conv0 = ConvBNLayer(
            ch_in=3,
            ch_out=32,
            filter_size=3,
            stride=1,
            padding=1,
            **norm_kwargs)

        self.downsample0 = DownSample(ch_in=32, ch_out=32 * 2, **norm_kwargs)

        self._out_channels = []
        self.darknet_conv_block_list = []
        self.downsample_list = []
        ch_in = [64, 128, 256, 512, 1024]
        # Register every stage first, then the down-sampling layers that sit
        # between them, keeping the sublayer registration order stable.
        for i, stage in enumerate(self.stages):
            name = 'stage.{}'.format(i)
            conv_block = self.add_sublayer(
                name,
                Blocks(
                    int(ch_in[i]),
                    int(ch_in[i]),
                    stage,
                    name=name,
                    **norm_kwargs))
            self.darknet_conv_block_list.append(conv_block)
            if i in self.return_idx:
                self._out_channels.append(int(ch_in[i]))
        for i in range(num_stages - 1):
            down_name = 'stage.{}.downsample'.format(i)
            downsample = self.add_sublayer(
                down_name,
                DownSample(
                    ch_in=int(ch_in[i]),
                    ch_out=int(ch_in[i + 1]),
                    **norm_kwargs))
            self.downsample_list.append(downsample)

    def forward(self, inputs):
        """Return the feature maps of the stages listed in ``return_idx``.

        Args:
            inputs (dict): mapping that holds the input under the key 'image'.

        Returns:
            list: selected stage outputs, in stage order.
        """
        x = inputs['image']

        out = self.conv0(x)
        out = self.downsample0(out)
        blocks = []
        for i, conv_block_i in enumerate(self.darknet_conv_block_list):
            out = conv_block_i(out)
            if i == self.freeze_at:
                # Cut the gradient here so earlier layers are not updated.
                out.stop_gradient = True
            if i in self.return_idx:
                blocks.append(out)
            # Every stage except the last is followed by a down-sampling layer.
            if i < self.num_stages - 1:
                out = self.downsample_list[i](out)
        return blocks

    @property
    def out_shape(self):
        """List one ``ShapeSpec(channels)`` per returned feature map."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
720
rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
Normal file
720
rtdetr_paddle/ppdet/modeling/backbones/focalnet.py
Normal file
@@ -0,0 +1,720 @@
|
||||
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py
|
||||
"""
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from .transformer_utils import DropPath, Identity
|
||||
from .transformer_utils import add_parameter, to_2tuple
|
||||
from .transformer_utils import ones_, zeros_, trunc_normal_
|
||||
from .swin_transformer import Mlp
|
||||
|
||||
__all__ = ['FocalNet']
|
||||
|
||||
# Constructor settings and pretrained-weight URL for each FocalNet variant.
MODEL_cfg = {
    'focalnet_T_224_1k_srf': {
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.2,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams',
    },
    'focalnet_S_224_1k_srf': {
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.3,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams',
    },
    'focalnet_B_224_1k_srf': {
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'focal_levels': [2, 2, 2, 2],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams',
    },
    'focalnet_T_224_1k_lrf': {
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.2,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams',
    },
    'focalnet_S_224_1k_lrf': {
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.3,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams',
    },
    'focalnet_B_224_1k_lrf': {
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': False,
        'use_postln': False,
        'use_postln_in_modulation': False,
        'use_layerscale': False,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams',
    },
    'focalnet_L_384_22k_fl3': {
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [5, 5, 5, 5],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams',
    },
    'focalnet_L_384_22k_fl4': {
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': True,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams',
    },
    'focalnet_XL_384_22k_fl3': {
        'embed_dim': 256,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [5, 5, 5, 5],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams',
    },
    'focalnet_XL_384_22k_fl4': {
        'embed_dim': 256,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': False,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams',
    },
    'focalnet_H_224_22k_fl3': {
        'embed_dim': 352,
        'depths': [2, 2, 18, 2],
        'focal_levels': [3, 3, 3, 3],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': True,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams',
    },
    'focalnet_H_224_22k_fl4': {
        'embed_dim': 352,
        'depths': [2, 2, 18, 2],
        'focal_levels': [4, 4, 4, 4],
        'focal_windows': [3, 3, 3, 3],
        'drop_path_rate': 0.5,
        'use_conv_embed': True,
        'use_postln': True,
        'use_postln_in_modulation': True,
        'use_layerscale': True,
        'normalize_modulator': False,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams',
    },
}
|
||||
|
||||
|
||||
class FocalModulation(nn.Layer):
    """Focal modulation layer.

    Args:
        dim (int): Number of input channels.
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        focal_factor (int): Step to increase the focal window. Default: 2
        use_postln_in_modulation (bool): Whether use post-modulation layernorm
        normalize_modulator (bool): Whether use normalize in modulator
    """

    def __init__(self,
                 dim,
                 proj_drop=0.,
                 focal_level=2,
                 focal_window=7,
                 focal_factor=2,
                 use_postln_in_modulation=False,
                 normalize_modulator=False):
        super().__init__()
        self.dim = dim

        # specific args for focalv3
        self.focal_level = focal_level
        self.focal_window = focal_window
        self.focal_factor = focal_factor
        self.use_postln_in_modulation = use_postln_in_modulation
        self.normalize_modulator = normalize_modulator

        # One projection yields the query, the context and focal_level + 1 gates.
        fused_width = 2 * dim + (self.focal_level + 1)
        self.f = nn.Linear(dim, fused_width, bias_attr=True)
        self.h = nn.Conv2D(
            dim,
            dim,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            bias_attr=True)

        self.act = nn.GELU()
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)
        self.focal_layers = nn.LayerList()

        if self.use_postln_in_modulation:
            self.ln = nn.LayerNorm(dim)

        # Per-channel (groups=dim) convolutions whose kernel grows by
        # focal_factor at every level.
        for level in range(self.focal_level):
            ksize = self.focal_factor * level + self.focal_window
            depthwise = nn.Conv2D(
                dim,
                dim,
                kernel_size=ksize,
                stride=1,
                groups=dim,
                padding=ksize // 2,
                bias_attr=False)
            self.focal_layers.append(nn.Sequential(depthwise, nn.GELU()))

    def forward(self, x):
        """ Forward function.
        Args:
            x: input features with shape of (B, H, W, C)
        """
        _, _, _, channels = x.shape
        fused = self.f(x).transpose([0, 3, 1, 2])
        query, context, gates = paddle.split(
            fused, (channels, channels, self.focal_level + 1), 1)

        # Accumulate the gated context of every focal level.
        modulator = 0
        for level in range(self.focal_level):
            context = self.focal_layers[level](context)
            modulator = modulator + context * gates[:, level:level + 1]

        # The last gate weights the mean of the final context over axes 2 and 3.
        pooled = context.mean(2, keepdim=True).mean(3, keepdim=True)
        global_context = self.act(pooled)
        modulator = modulator + global_context * gates[:, self.focal_level:]
        if self.normalize_modulator:
            modulator = modulator / (self.focal_level + 1)

        out = query * self.h(modulator)
        out = out.transpose([0, 2, 3, 1])
        if self.use_postln_in_modulation:
            out = self.ln(out)
        return self.proj_drop(self.proj(out))
|
||||
|
||||
|
||||
class FocalModulationBlock(nn.Layer):
    """Residual block made of a focal modulation step and an MLP step.

    The spatial size of the token grid is not passed to ``forward``; the
    caller must set the ``H`` and ``W`` attributes before calling the block.

    Args:
        dim (int): Number of input channels.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer.
            Default: nn.LayerNorm
        focal_level (int): Number of focal levels.
        focal_window (int): Focal kernel size at level 1.
        use_postln (bool): Apply the norm layers after modulation / MLP
            instead of before them. Default: False.
        use_postln_in_modulation (bool): Forwarded to FocalModulation.
            Default: False.
        normalize_modulator (bool): Forwarded to FocalModulation.
            Default: False.
        use_layerscale (bool): Scale both residual branches with learnable
            per-channel factors. Default: False
        layerscale_value (float): Initial value of those factors.
            Default: 1e-4
    """

    def __init__(self,
                 dim,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 focal_level=2,
                 focal_window=9,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_layerscale=False,
                 layerscale_value=1e-4):
        super().__init__()
        self.dim = dim
        self.mlp_ratio = mlp_ratio
        self.focal_window = focal_window
        self.focal_level = focal_level
        self.use_postln = use_postln
        self.use_layerscale = use_layerscale

        # Modulation branch.
        self.norm1 = norm_layer(dim)
        self.modulation = FocalModulation(
            dim,
            proj_drop=drop,
            focal_level=focal_level,
            focal_window=focal_window,
            use_postln_in_modulation=use_postln_in_modulation,
            normalize_modulator=normalize_modulator)

        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()

        # MLP branch.
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        # Token-grid size, to be filled in by the caller before forward().
        self.H = None
        self.W = None

        # Residual scaling: plain 1.0 unless layer scale is enabled.
        if use_layerscale:
            self.gamma_1 = add_parameter(self,
                                         layerscale_value * paddle.ones([dim]))
            self.gamma_2 = add_parameter(self,
                                         layerscale_value * paddle.ones([dim]))
        else:
            self.gamma_1 = 1.0
            self.gamma_2 = 1.0

    def forward(self, x):
        """Apply the block to a token sequence.

        Args:
            x: Input feature, tensor size (B, H*W, C).

        Returns:
            Tensor produced by the two residual branches; the modulation
            output is reshaped to (-1, H*W, C) before being added back.
        """
        _, num_tokens, channels = x.shape
        height, width = self.H, self.W
        assert num_tokens == height * width, "input feature has wrong size"

        residual = x
        post_norm = self.use_postln

        # Focal modulation works on a (B, H, W, C) grid.
        if not post_norm:
            x = self.norm1(x)
        grid = x.reshape([-1, height, width, channels])
        x = self.modulation(grid).reshape([-1, height * width, channels])
        if post_norm:
            x = self.norm1(x)
        x = residual + self.drop_path(self.gamma_1 * x)

        # MLP branch; the norm sits after the MLP in post-norm mode and
        # before it otherwise.
        if post_norm:
            branch = self.norm2(self.mlp(x))
        else:
            branch = self.mlp(self.norm2(x))
        return x + self.drop_path(self.gamma_2 * branch)
|
||||
|
||||
|
||||
class BasicLayer(nn.Layer):
    """One stage: a stack of FocalModulationBlock plus optional downsampling.

    Args:
        dim (int): Number of feature channels
        depth (int): Number of blocks in this stage.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            Default: 4.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | np.ndarray, optional): Stochastic depth rate.
            Indexed per block only when it is a ``np.ndarray``; any other
            value is passed unchanged to every block. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer.
            Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer class
            instantiated at the end of the stage. Default: None
        focal_level (int): Number of focal levels
        focal_window (int): Focal window size at focal level 1
        use_conv_embed (bool): Forwarded to the downsample layer.
        use_layerscale (bool): Forwarded to every block. Default: False
        layerscale_value (float): Forwarded to every block.
        use_postln (bool): Forwarded to every block. Default: False.
        use_postln_in_modulation (bool): Forwarded to every block.
            Default: False.
        normalize_modulator (bool): Forwarded to every block.
        use_checkpoint (bool): Stored on the instance; not otherwise used
            inside this class. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 mlp_ratio=4.,
                 drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 focal_level=2,
                 focal_window=9,
                 use_conv_embed=False,
                 use_layerscale=False,
                 layerscale_value=1e-4,
                 use_postln=False,
                 use_postln_in_modulation=False,
                 normalize_modulator=False,
                 use_checkpoint=False):
        super().__init__()
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # Only a numpy array is treated as a per-block schedule.
        per_block_rate = isinstance(drop_path, np.ndarray)

        stage_blocks = []
        for idx in range(depth):
            stage_blocks.append(
                FocalModulationBlock(
                    dim=dim,
                    mlp_ratio=mlp_ratio,
                    drop=drop,
                    drop_path=drop_path[idx] if per_block_rate else drop_path,
                    act_layer=nn.GELU,
                    norm_layer=norm_layer,
                    focal_level=focal_level,
                    focal_window=focal_window,
                    use_postln=use_postln,
                    use_postln_in_modulation=use_postln_in_modulation,
                    normalize_modulator=normalize_modulator,
                    use_layerscale=use_layerscale,
                    layerscale_value=layerscale_value))
        self.blocks = nn.LayerList(stage_blocks)

        # Optional patch-merging layer that halves resolution and doubles
        # the channel count.
        if downsample is None:
            self.downsample = None
        else:
            self.downsample = downsample(
                patch_size=2,
                in_chans=dim,
                embed_dim=2 * dim,
                use_conv_embed=use_conv_embed,
                norm_layer=norm_layer,
                is_stem=False)

    def forward(self, x, H, W):
        """Run all blocks, then the downsample layer if there is one.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H: Height of the token grid.
            W: Width of the token grid.

        Returns:
            A 6-tuple ``(x, H, W, x_next, H_next, W_next)``: the stage output
            with its grid size, followed by the input for the next stage
            with its grid size. Without a downsample layer both halves are
            the same.
        """
        for block in self.blocks:
            # Blocks read the grid size from attributes, not arguments.
            block.H, block.W = H, W
            x = block(x)

        if self.downsample is None:
            return x, H, W, x, H, W

        # Tokens -> (B, C, H, W) for the downsample layer, then back to tokens.
        feature_map = x.transpose([0, 2, 1]).reshape(
            [x.shape[0], x.shape[-1], H, W])
        reduced = self.downsample(feature_map)
        reduced = reduced.flatten(2).transpose([0, 2, 1])
        next_h = (H + 1) // 2
        next_w = (W + 1) // 2
        return x, H, W, reduced, next_h, next_w
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """Image to Patch Embedding.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer class applied to
            the projected tokens. Default: None
        use_conv_embed (bool): Use an overlapped convolution instead of a
            ``patch_size``-strided one. Default: False
        is_stem (bool): Selects the stem variant (kernel 7, stride 4,
            padding 2) instead of the non-stem variant (kernel 3, stride 2,
            padding 1) of the overlapped convolution. Only consulted when
            ``use_conv_embed`` is True.
    """

    def __init__(self,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 norm_layer=None,
                 use_conv_embed=False,
                 is_stem=False):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        self.patch_size = patch_size

        self.in_chans = in_chans
        self.embed_dim = embed_dim

        if use_conv_embed:
            # With conv embedding the stem and non-stem layers use
            # different kernel / padding / stride settings.
            if is_stem:
                kernel_size, padding, stride = 7, 2, 4
            else:
                kernel_size, padding, stride = 3, 1, 2
            self.proj = nn.Conv2D(
                in_chans,
                embed_dim,
                kernel_size=kernel_size,
                stride=stride,
                padding=padding)
        else:
            self.proj = nn.Conv2D(
                in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

        if norm_layer is not None:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = None

    def forward(self, x):
        """Project an image-like tensor to patch embeddings.

        Args:
            x: 4-D input whose last two axes are height and width.

        Returns:
            4-D tensor of projected features; when a norm layer is
            configured it is applied over the channel axis of the flattened
            tokens and the result is reshaped back to
            ``(-1, embed_dim, Wh, Ww)``.
        """
        _, _, H, W = x.shape

        # Zero-pad on the right / bottom so both spatial sizes become
        # multiples of the patch size. The previous code also updated H and W
        # with `X += X % patch`, which is not the padded size and was never
        # read afterwards, so that bookkeeping is dropped.
        pad_right = (self.patch_size[1] - W % self.patch_size[1]
                     ) % self.patch_size[1]
        pad_bottom = (self.patch_size[0] - H % self.patch_size[0]
                      ) % self.patch_size[0]
        if pad_right or pad_bottom:
            # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom]
            x = F.pad(x, [0, pad_right, 0, pad_bottom])

        x = self.proj(x)
        if self.norm is not None:
            _, _, Wh, Ww = x.shape
            # Normalise over channels on the token layout, then restore
            # the feature-map layout.
            x = x.flatten(2).transpose([0, 2, 1])
            x = self.norm(x)
            x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww])

        return x
|
||||
|
||||
|
||||
@register
@serializable
class FocalNet(nn.Layer):
    """FocalNet backbone.

    Note that ``embed_dim``, ``depths``, ``drop_path_rate``,
    ``focal_levels``, ``focal_windows``, ``use_conv_embed``,
    ``use_layerscale``, ``use_postln``, ``use_postln_in_modulation`` and
    ``normalize_modulator`` are always replaced by the values stored in
    ``MODEL_cfg[arch]``; the constructor arguments of the same name are
    ignored.

    Args:
        arch (str): Key into ``MODEL_cfg`` selecting the architecture.
        out_indices (Sequence[int]): Stages whose output is returned.
        frozen_stages (int): Controls which parts are frozen (stop gradient
            and eval mode), see ``_freeze_stages``. -1 freezes nothing.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Overridden by ``MODEL_cfg[arch]``.
        depths (list[int]): Overridden by ``MODEL_cfg[arch]``.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            Default: 4.
        drop_rate (float): Dropout rate.
        drop_path_rate (float): Overridden by ``MODEL_cfg[arch]``.
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        patch_norm (bool): If True, add normalization after patch embedding.
            Default: True.
        focal_levels (Sequence[int]): Overridden by ``MODEL_cfg[arch]``.
        focal_windows (Sequence[int]): Overridden by ``MODEL_cfg[arch]``.
        use_conv_embed (bool): Overridden by ``MODEL_cfg[arch]``.
        use_layerscale (bool): Overridden by ``MODEL_cfg[arch]``.
        layerscale_value (float): Value of layerscale.
        use_postln (bool): Overridden by ``MODEL_cfg[arch]``.
        use_postln_in_modulation (bool): Overridden by ``MODEL_cfg[arch]``.
        normalize_modulator (bool): Overridden by ``MODEL_cfg[arch]``.
        use_checkpoint (bool): Forwarded to every stage. Default: False.
        pretrained (str | None): URL or local path of weights to load; when
            None the value from ``MODEL_cfg[arch]`` is used.
    """

    def __init__(
            self,
            arch='focalnet_T_224_1k_srf',
            out_indices=(0, 1, 2, 3),
            frozen_stages=-1,
            patch_size=4,
            in_chans=3,
            embed_dim=96,
            depths=[2, 2, 6, 2],
            mlp_ratio=4.,
            drop_rate=0.,
            drop_path_rate=0.2,  # 0.5 better for large+ models
            norm_layer=nn.LayerNorm,
            patch_norm=True,
            focal_levels=[2, 2, 2, 2],
            focal_windows=[3, 3, 3, 3],
            use_conv_embed=False,
            use_layerscale=False,
            layerscale_value=1e-4,
            use_postln=False,
            use_postln_in_modulation=False,
            normalize_modulator=False,
            use_checkpoint=False,
            pretrained=None):
        super(FocalNet, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The architecture table is authoritative for these settings.
        arch_cfg = MODEL_cfg[arch]
        embed_dim = arch_cfg['embed_dim']
        depths = arch_cfg['depths']
        drop_path_rate = arch_cfg['drop_path_rate']
        focal_levels = arch_cfg['focal_levels']
        focal_windows = arch_cfg['focal_windows']
        use_conv_embed = arch_cfg['use_conv_embed']
        use_layerscale = arch_cfg['use_layerscale']
        use_postln = arch_cfg['use_postln']
        use_postln_in_modulation = arch_cfg['use_postln_in_modulation']
        normalize_modulator = arch_cfg['normalize_modulator']
        if pretrained is None:
            pretrained = arch_cfg['pretrained']

        self.out_indices = out_indices
        self.frozen_stages = frozen_stages
        self.num_layers = len(depths)
        self.patch_norm = patch_norm

        # Stem: split the image into patches.
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if patch_norm else None,
            use_conv_embed=use_conv_embed,
            is_stem=True)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # Stochastic depth rate grows linearly over all blocks of all stages.
        dpr = np.linspace(0, drop_path_rate, sum(depths))

        # Build the stages; every stage but the last ends with a downsample.
        self.layers = nn.LayerList()
        last_stage = self.num_layers - 1
        block_start = 0
        for stage_idx in range(self.num_layers):
            block_stop = block_start + depths[stage_idx]
            self.layers.append(
                BasicLayer(
                    dim=int(embed_dim * 2**stage_idx),
                    depth=depths[stage_idx],
                    mlp_ratio=mlp_ratio,
                    drop=drop_rate,
                    drop_path=dpr[block_start:block_stop],
                    norm_layer=norm_layer,
                    downsample=PatchEmbed if stage_idx < last_stage else None,
                    focal_level=focal_levels[stage_idx],
                    focal_window=focal_windows[stage_idx],
                    use_conv_embed=use_conv_embed,
                    use_layerscale=use_layerscale,
                    layerscale_value=layerscale_value,
                    use_postln=use_postln,
                    use_postln_in_modulation=use_postln_in_modulation,
                    normalize_modulator=normalize_modulator,
                    use_checkpoint=use_checkpoint))
            block_start = block_stop

        # Channel count doubles at every stage.
        self.num_features = [
            int(embed_dim * 2**i) for i in range(self.num_layers)
        ]

        # One output norm per returned stage, registered as `norm<i>`.
        for stage_idx in out_indices:
            self.add_sublayer(f'norm{stage_idx}',
                              norm_layer(self.num_features[stage_idx]))

        self.apply(self._init_weights)
        self._freeze_stages()

        if pretrained:
            if 'http' in pretrained:  #URL
                weights_path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  #model in local path
                weights_path = pretrained
            self.set_state_dict(paddle.load(weights_path))

    def _freeze_stages(self):
        """Freeze parts of the network according to ``frozen_stages``.

        ``frozen_stages >= 0`` freezes the patch embedding; ``>= 2``
        additionally puts ``pos_drop`` in eval mode and freezes the first
        ``frozen_stages - 1`` stages.
        """

        def _freeze(layer):
            layer.eval()
            for param in layer.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 0:
            _freeze(self.patch_embed)

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for stage_idx in range(0, self.frozen_stages - 1):
                _freeze(self.layers[stage_idx])

    def _init_weights(self, m):
        """Initialise one sublayer; used as the callback of ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Compute the multi-scale feature maps.

        Args:
            x: Mapping that holds the input image tensor under ``'image'``.

        Returns:
            list: One tensor per stage listed in ``out_indices``, each
            reshaped to ``(-1, H, W, C)`` and transposed to ``(-1, C, H, W)``.
        """
        feat = self.patch_embed(x['image'])
        _, _, grid_h, grid_w = feat.shape
        tokens = self.pos_drop(feat.flatten(2).transpose([0, 2, 1]))

        outs = []
        for stage_idx in range(self.num_layers):
            stage = self.layers[stage_idx]
            stage_out, H, W, tokens, grid_h, grid_w = stage(tokens, grid_h,
                                                            grid_w)
            if stage_idx not in self.out_indices:
                continue
            normed = getattr(self, f'norm{stage_idx}')(stage_out)
            channels = self.num_features[stage_idx]
            outs.append(
                normed.reshape([-1, H, W, channels]).transpose((0, 3, 1, 2)))

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) of every returned stage."""
        out_strides = [4, 8, 16, 32]
        specs = []
        for i in self.out_indices:
            specs.append(
                ShapeSpec(
                    channels=self.num_features[i], stride=out_strides[i]))
        return specs
|
||||
447
rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
Normal file
447
rtdetr_paddle/ppdet/modeling/backbones/hgnet_v2.py
Normal file
@@ -0,0 +1,447 @@
|
||||
# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn.initializer import KaimingNormal, Constant
|
||||
from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle import ParamAttr
|
||||
|
||||
import copy
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['PPHGNetV2']
|
||||
|
||||
kaiming_normal_ = KaimingNormal()
|
||||
zeros_ = Constant(value=0.)
|
||||
ones_ = Constant(value=1.)
|
||||
|
||||
|
||||
class LearnableAffineBlock(nn.Layer):
    """Learnable scalar affine transform ``scale * x + bias``.

    Args:
        scale_value (float): Initial value of the scale. Default: 1.0
        bias_value (float): Initial value of the bias. Default: 0.0
        lr_mult (float): Learning-rate multiplier of the owning stage.
        lab_lr (float): Extra multiplier; both parameters use
            ``lr_mult * lab_lr`` as their learning rate.
    """

    def __init__(self,
                 scale_value=1.0,
                 bias_value=0.0,
                 lr_mult=1.0,
                 lab_lr=0.01):
        super().__init__()
        param_lr = lr_mult * lab_lr

        self.scale = self._scalar_parameter(scale_value, param_lr)
        self.add_parameter("scale", self.scale)

        self.bias = self._scalar_parameter(bias_value, param_lr)
        self.add_parameter("bias", self.bias)

    def _scalar_parameter(self, init_value, learning_rate):
        """Create a shape-[1] parameter filled with ``init_value``."""
        return self.create_parameter(
            shape=[1, ],
            default_initializer=Constant(value=init_value),
            attr=ParamAttr(learning_rate=learning_rate))

    def forward(self, x):
        """Return ``self.scale * x + self.bias``."""
        scaled = self.scale * x
        return scaled + self.bias
|
||||
|
||||
|
||||
class ConvBNAct(nn.Layer):
    """Bias-free Conv2D + BatchNorm2D, optionally followed by ReLU and LAB.

    Args:
        in_channels (int): Input channels of the convolution.
        out_channels (int): Output channels of the convolution.
        kernel_size (int): Convolution kernel size. Default: 3
        stride (int): Convolution stride. Default: 1
        padding (int | str): Only a string value is forwarded to the
            convolution; for any other value the padding is
            ``(kernel_size - 1) // 2``. Default: 1
        groups (int): Convolution groups. Default: 1
        use_act (bool): Append a ReLU. Default: True
        use_lab (bool): Append a LearnableAffineBlock. Default: False
        lr_mult (float): Learning-rate multiplier for all parameters.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 groups=1,
                 use_act=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.use_act = use_act
        self.use_lab = use_lab

        # Numeric `padding` arguments are replaced by kernel-derived padding.
        if isinstance(padding, str):
            conv_padding = padding
        else:
            conv_padding = (kernel_size - 1) // 2

        self.conv = Conv2D(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            padding=conv_padding,
            groups=groups,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=False)

        # BatchNorm scale and shift are excluded from weight decay.
        bn_weight_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        bn_bias_attr = ParamAttr(
            regularizer=L2Decay(0.0), learning_rate=lr_mult)
        self.bn = BatchNorm2D(
            out_channels, weight_attr=bn_weight_attr, bias_attr=bn_bias_attr)

        if use_act:
            self.act = ReLU()
        if use_lab:
            self.lab = LearnableAffineBlock(lr_mult=lr_mult)

    def forward(self, x):
        """Apply conv and batch norm, then the enabled optional layers."""
        out = self.bn(self.conv(x))
        if self.use_act:
            out = self.act(out)
        if self.use_lab:
            out = self.lab(out)
        return out
|
||||
|
||||
|
||||
class LightConvBNAct(nn.Layer):
    """1x1 ConvBNAct without activation followed by a depthwise ConvBNAct.

    Args:
        in_channels (int): Input channels of the 1x1 convolution.
        out_channels (int): Output channels of both convolutions.
        kernel_size (int): Kernel size of the depthwise convolution.
        stride (int): Accepted for signature compatibility; not used here.
        groups (int): Accepted for signature compatibility; not used here
            (the second convolution always uses ``out_channels`` groups).
        use_lab (bool): Forwarded to both ConvBNAct layers. Default: False
        lr_mult (float): Learning-rate multiplier. Default: 1.0
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 groups=1,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)

        # Pointwise projection, no activation.
        self.conv1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            use_act=False,
            **shared)
        # Depthwise convolution with activation.
        self.conv2 = ConvBNAct(
            in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=out_channels,
            use_act=True,
            **shared)

    def forward(self, x):
        """Apply ``conv1`` then ``conv2``."""
        return self.conv2(self.conv1(x))
|
||||
|
||||
|
||||
class StemBlock(nn.Layer):
    """Stem with a two-conv branch and a max-pool branch that are concatenated.

    Args:
        in_channels (int): Channels of the input.
        mid_channels (int): Channels used inside the stem.
        out_channels (int): Channels of the stem output.
        use_lab (bool): Forwarded to every ConvBNAct. Default: False
        lr_mult (float): Learning-rate multiplier. Default: 1.0
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        shared = dict(use_lab=use_lab, lr_mult=lr_mult)
        half_mid = mid_channels // 2

        self.stem1 = ConvBNAct(
            in_channels=in_channels,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        # Convolution branch: two 2x2 convolutions with "SAME" padding.
        self.stem2a = ConvBNAct(
            in_channels=mid_channels,
            out_channels=half_mid,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        self.stem2b = ConvBNAct(
            in_channels=half_mid,
            out_channels=mid_channels,
            kernel_size=2,
            stride=1,
            padding="SAME",
            **shared)
        # Consumes the concatenation of both branches (2 * mid_channels).
        self.stem3 = ConvBNAct(
            in_channels=mid_channels * 2,
            out_channels=mid_channels,
            kernel_size=3,
            stride=2,
            **shared)
        self.stem4 = ConvBNAct(
            in_channels=mid_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            **shared)
        # Pooling branch.
        self.pool = nn.MaxPool2D(
            kernel_size=2, stride=1, ceil_mode=True, padding="SAME")

    def forward(self, x):
        """Run the stem: stem1, the two parallel branches, stem3 and stem4."""
        base = self.stem1(x)
        conv_branch = self.stem2b(self.stem2a(base))
        pool_branch = self.pool(base)
        merged = paddle.concat([pool_branch, conv_branch], 1)
        return self.stem4(self.stem3(merged))
|
||||
|
||||
|
||||
class HG_Block(nn.Layer):
    """Stack of conv layers whose outputs are concatenated and aggregated.

    The block input and the output of every inner layer are concatenated
    along the channel axis and reduced by two 1x1 ConvBNAct layers
    (``out_channels // 2`` then ``out_channels``).

    Args:
        in_channels (int): Channels of the block input.
        mid_channels (int): Output channels of every inner layer.
        out_channels (int): Channels of the block output.
        kernel_size (int): Kernel size of the inner layers. Default: 3
        layer_num (int): Number of inner layers. Default: 6
        identity (bool): Add the block input to the output. Default: False
        light_block (bool): Use LightConvBNAct for the inner layers instead
            of ConvBNAct. Default: True
        use_lab (bool): Forwarded to every conv layer. Default: False
        lr_mult (float): Learning-rate multiplier. Default: 1.0
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 kernel_size=3,
                 layer_num=6,
                 identity=False,
                 light_block=True,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()
        self.identity = identity

        # Select the layer class directly instead of eval()-ing its name.
        block_cls = LightConvBNAct if light_block else ConvBNAct

        self.layers = nn.LayerList()
        for i in range(layer_num):
            self.layers.append(
                block_cls(
                    in_channels=in_channels if i == 0 else mid_channels,
                    out_channels=mid_channels,
                    stride=1,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult))

        # feature aggregation
        total_channels = in_channels + layer_num * mid_channels
        self.aggregation_squeeze_conv = ConvBNAct(
            in_channels=total_channels,
            out_channels=out_channels // 2,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)
        self.aggregation_excitation_conv = ConvBNAct(
            in_channels=out_channels // 2,
            out_channels=out_channels,
            kernel_size=1,
            stride=1,
            use_lab=use_lab,
            lr_mult=lr_mult)

    def forward(self, x):
        """Run the inner layers, aggregate, and optionally add the input."""
        identity = x
        # Collect the input and every intermediate output for concatenation.
        output = [x]
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        x = paddle.concat(output, axis=1)
        x = self.aggregation_squeeze_conv(x)
        x = self.aggregation_excitation_conv(x)
        if self.identity:
            x += identity
        return x
|
||||
|
||||
|
||||
class HG_Stage(nn.Layer):
    """Optional stride-2 depthwise downsample followed by HG_Block layers.

    Args:
        in_channels (int): Channels of the stage input.
        mid_channels (int): Forwarded to every HG_Block.
        out_channels (int): Output channels of every HG_Block.
        block_num (int): Number of HG_Block layers.
        layer_num (int): Forwarded to every HG_Block. Default: 6
        downsample (bool): Prepend the downsample convolution. Default: True
        light_block (bool): Forwarded to every HG_Block. Default: True
        kernel_size (int): Forwarded to every HG_Block. Default: 3
        use_lab (bool): Forwarded to every conv layer. Default: False
        lr_mult (float): Learning-rate multiplier. Default: 1.0
    """

    def __init__(self,
                 in_channels,
                 mid_channels,
                 out_channels,
                 block_num,
                 layer_num=6,
                 downsample=True,
                 light_block=True,
                 kernel_size=3,
                 use_lab=False,
                 lr_mult=1.0):
        super().__init__()

        # `self.downsample` is either the falsy flag as given or the layer.
        if downsample:
            self.downsample = ConvBNAct(
                in_channels=in_channels,
                out_channels=in_channels,
                kernel_size=3,
                stride=2,
                groups=in_channels,
                use_act=False,
                use_lab=use_lab,
                lr_mult=lr_mult)
        else:
            self.downsample = downsample

        # Only the first block changes the channel count and has no
        # residual connection.
        stage_blocks = [
            HG_Block(
                in_channels=in_channels if idx == 0 else out_channels,
                mid_channels=mid_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                layer_num=layer_num,
                identity=idx != 0,
                light_block=light_block,
                use_lab=use_lab,
                lr_mult=lr_mult) for idx in range(block_num)
        ]
        self.blocks = nn.Sequential(*stage_blocks)

    def forward(self, x):
        """Downsample when configured, then run all blocks."""
        if self.downsample:
            x = self.downsample(x)
        return self.blocks(x)
|
||||
|
||||
|
||||
def _freeze_norm(m: nn.BatchNorm2D):
    """Build a frozen BatchNorm2D with the same feature count as ``m``.

    The returned layer is newly constructed from ``m._num_features``; it
    uses global statistics, has zero learning rate, no weight decay and
    non-trainable parameters with ``stop_gradient`` set. Nothing else is
    read from ``m`` here.
    """

    def _frozen_attr():
        # A separate attribute object for weight and bias.
        return ParamAttr(
            learning_rate=0., regularizer=L2Decay(0.), trainable=False)

    frozen_bn = nn.BatchNorm2D(
        m._num_features,
        weight_attr=_frozen_attr(),
        bias_attr=_frozen_attr(),
        use_global_stats=True)
    for frozen_param in frozen_bn.parameters():
        frozen_param.stop_gradient = True
    return frozen_bn
|
||||
|
||||
|
||||
def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
    """Recursively replace every BatchNorm2D in ``model``.

    Args:
        model: Layer to process. If it is itself a BatchNorm2D, the
            replacement is returned instead of ``model``.
        reset_func: Callable that maps a BatchNorm2D to its replacement.

    Returns:
        ``reset_func(model)`` for a BatchNorm2D, otherwise ``model`` with
        its children updated in place.
    """
    if isinstance(model, nn.BatchNorm2D):
        return reset_func(model)

    for child_name, child in model.named_children():
        replacement = reset_bn(child, reset_func)
        # Only rebind attributes whose layer was actually swapped.
        if replacement is not child:
            setattr(model, child_name, replacement)
    return model
|
||||
|
||||
|
||||
@register
@serializable
class PPHGNetV2(nn.Layer):
    """PPHGNetV2 backbone.

    Args:
        arch (str): Key into ``arch_configs`` ('L' or 'X').
        use_lab (bool): Whether to use LearnableAffineBlock in the network.
        lr_mult_list (list): Learning-rate multipliers; index 0 is used for
            the stem and index ``i + 1`` for stage ``i``.
        return_idx (list): Indices of the stages whose output is returned.
        freeze_stem_only (bool): When freezing, freeze only the stem.
        freeze_at (int): Freezing is enabled when ``>= 0``; unless
            ``freeze_stem_only`` is set, stages ``0..freeze_at`` are frozen
            as well.
        freeze_norm (bool): Replace every BatchNorm2D with a frozen one.

    Returns:
        model: nn.Layer. Specific PPHGNetV2 model depends on args.
    """

    # Each stage entry is:
    # in_channels, mid_channels, out_channels, num_blocks, downsample,
    # light_block, kernel_size, layer_num
    arch_configs = {
        'L': {
            'stem_channels': [3, 32, 48],
            'stage_config': {
                "stage1": [48, 48, 128, 1, False, False, 3, 6],
                "stage2": [128, 96, 512, 1, True, False, 3, 6],
                "stage3": [512, 192, 1024, 3, True, True, 5, 6],
                "stage4": [1024, 384, 2048, 1, True, True, 5, 6],
            }
        },
        'X': {
            'stem_channels': [3, 32, 64],
            'stage_config': {
                "stage1": [64, 64, 128, 1, False, False, 3, 6],
                "stage2": [128, 128, 512, 2, True, False, 3, 6],
                "stage3": [512, 256, 1024, 5, True, True, 5, 6],
                "stage4": [1024, 512, 2048, 2, True, True, 5, 6],
            }
        }
    }

    def __init__(self,
                 arch,
                 use_lab=False,
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
                 return_idx=[1, 2, 3],
                 freeze_stem_only=True,
                 freeze_at=0,
                 freeze_norm=True):
        super().__init__()
        self.use_lab = use_lab
        self.return_idx = return_idx

        stem_channels = self.arch_configs[arch]['stem_channels']
        stage_config = self.arch_configs[arch]['stage_config']

        self._out_strides = [4, 8, 16, 32]
        # Output channels are the third entry of every stage spec.
        self._out_channels = [spec[2] for spec in stage_config.values()]

        # stem
        self.stem = StemBlock(
            in_channels=stem_channels[0],
            mid_channels=stem_channels[1],
            out_channels=stem_channels[2],
            use_lab=use_lab,
            lr_mult=lr_mult_list[0])

        # stages
        self.stages = nn.LayerList()
        for stage_idx, spec in enumerate(stage_config.values()):
            (in_channels, mid_channels, out_channels, block_num, downsample,
             light_block, kernel_size, layer_num) = spec
            self.stages.append(
                HG_Stage(
                    in_channels=in_channels,
                    mid_channels=mid_channels,
                    out_channels=out_channels,
                    block_num=block_num,
                    layer_num=layer_num,
                    downsample=downsample,
                    light_block=light_block,
                    kernel_size=kernel_size,
                    use_lab=use_lab,
                    lr_mult=lr_mult_list[stage_idx + 1]))

        if freeze_at >= 0:
            self._freeze_parameters(self.stem)
            if not freeze_stem_only:
                num_frozen = min(freeze_at + 1, len(self.stages))
                for stage_idx in range(num_frozen):
                    self._freeze_parameters(self.stages[stage_idx])

        if freeze_norm:
            reset_bn(self, reset_func=_freeze_norm)

        self._init_weights()

    def _freeze_parameters(self, m):
        """Set ``stop_gradient`` on every parameter of ``m``."""
        for param in m.parameters():
            param.stop_gradient = True

    def _init_weights(self):
        """Initialise Conv2D, BatchNorm2D and Linear sublayers."""
        for layer in self.sublayers():
            if isinstance(layer, nn.Conv2D):
                kaiming_normal_(layer.weight)
            elif isinstance(layer, (nn.BatchNorm2D)):
                ones_(layer.weight)
                zeros_(layer.bias)
            elif isinstance(layer, nn.Linear):
                zeros_(layer.bias)

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) of every returned stage."""
        specs = []
        for i in self.return_idx:
            specs.append(
                ShapeSpec(
                    channels=self._out_channels[i],
                    stride=self._out_strides[i]))
        return specs

    def forward(self, inputs):
        """Return the outputs of the stages listed in ``return_idx``.

        Args:
            inputs: Mapping that holds the input image tensor under
                ``'image'``.
        """
        feat = self.stem(inputs['image'])
        outs = []
        for stage_idx, stage in enumerate(self.stages):
            feat = stage(feat)
            if stage_idx in self.return_idx:
                outs.append(feat)
        return outs
|
||||
271
rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
Normal file
271
rtdetr_paddle/ppdet/modeling/backbones/lcnet.py
Normal file
@@ -0,0 +1,271 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn import AdaptiveAvgPool2D, Conv2D
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['LCNet']
|
||||
|
||||
# Per-stage block specifications.
# Every row is: k, in_c, out_c, s, use_se
NET_CONFIG = {
    "blocks2": [
        [3, 16, 32, 1, False],
    ],
    "blocks3": [
        [3, 32, 64, 2, False],
        [3, 64, 64, 1, False],
    ],
    "blocks4": [
        [3, 64, 128, 2, False],
        [3, 128, 128, 1, False],
    ],
    # One stride-2 row followed by five identical 5x5 rows.
    "blocks5": [[3, 128, 256, 2, False]] +
    [[5, 256, 256, 1, False] for _ in range(5)],
    "blocks6": [
        [5, 256, 512, 2, True],
        [5, 512, 512, 1, True],
    ],
}
|
||||
|
||||
|
||||
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a multiple of ``divisor``, bounded below.

    Args:
        v: Value to round.
        divisor: Step the result is aligned to. Default: 8
        min_value: Lower bound of the result; defaults to ``divisor``.

    Returns:
        ``v`` rounded to the nearest multiple of ``divisor`` (at least
        ``min_value``), bumped by one ``divisor`` if that result would be
        below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    rounded = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, rounded)
    # Never shrink the value by more than 10%.
    if result < 0.9 * v:
        result += divisor
    return result
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free Conv2D + BatchNorm2D + activation.

    Args:
        num_channels (int): Input channels of the convolution.
        filter_size (int): Kernel size; padding is ``(filter_size - 1) // 2``.
        num_filters (int): Output channels of the convolution.
        stride (int): Convolution stride.
        num_groups (int): Convolution groups. Default: 1
        act (str): Activation name, ``'hard_swish'`` or ``'relu6'``.

    Raises:
        ValueError: If ``act`` is not one of the supported names. Previously
            such a value left ``self.act`` undefined and only failed later
            inside ``forward`` with an AttributeError.
    """

    def __init__(self,
                 num_channels,
                 filter_size,
                 num_filters,
                 stride,
                 num_groups=1,
                 act='hard_swish'):
        super().__init__()

        self.conv = Conv2D(
            in_channels=num_channels,
            out_channels=num_filters,
            kernel_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=num_groups,
            weight_attr=ParamAttr(initializer=KaimingNormal()),
            bias_attr=False)

        # BatchNorm scale and shift are excluded from weight decay.
        self.bn = nn.BatchNorm2D(
            num_filters,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        if act == 'hard_swish':
            self.act = nn.Hardswish()
        elif act == 'relu6':
            self.act = nn.ReLU6()
        else:
            # Fail at construction with a clear message rather than with an
            # AttributeError on the first forward pass.
            raise ValueError(
                "Unsupported act: {!r}, expected 'hard_swish' or 'relu6'".
                format(act))

    def forward(self, x):
        """Apply convolution, batch norm and the activation."""
        x = self.conv(x)
        x = self.bn(x)
        x = self.act(x)
        return x
|
||||
|
||||
|
||||
class DepthwiseSeparable(nn.Layer):
    """Depthwise conv, optional SE module, then pointwise (1x1) conv.

    Args:
        num_channels (int): Input channels (also the depthwise output).
        num_filters (int): Output channels of the pointwise convolution.
        stride (int): Stride of the depthwise convolution.
        dw_size (int): Kernel size of the depthwise convolution. Default: 3
        use_se (bool): Insert an SEModule between the two convolutions.
        act (str): Activation name forwarded to both ConvBNLayer layers.
    """

    def __init__(self,
                 num_channels,
                 num_filters,
                 stride,
                 dw_size=3,
                 use_se=False,
                 act='hard_swish'):
        super().__init__()
        self.use_se = use_se

        # Depthwise: one group per input channel.
        self.dw_conv = ConvBNLayer(
            num_channels=num_channels,
            num_filters=num_channels,
            filter_size=dw_size,
            stride=stride,
            num_groups=num_channels,
            act=act)

        if use_se:
            self.se = SEModule(num_channels)

        # Pointwise: mixes channels with a 1x1 kernel.
        self.pw_conv = ConvBNLayer(
            num_channels=num_channels,
            filter_size=1,
            num_filters=num_filters,
            stride=1,
            act=act)

    def forward(self, x):
        """Apply the depthwise conv, the optional SE module and the 1x1 conv."""
        out = self.dw_conv(x)
        if self.use_se:
            out = self.se(out)
        return self.pw_conv(out)
|
||||
|
||||
|
||||
class SEModule(nn.Layer):
    """Channel gating: pooled features pass two 1x1 convs and rescale the input.

    Args:
        channel (int): Channels of the input and output.
        reduction (int): The hidden 1x1 convolution has
            ``channel // reduction`` channels. Default: 4
    """

    def __init__(self, channel, reduction=4):
        super().__init__()
        squeezed = channel // reduction

        self.avg_pool = AdaptiveAvgPool2D(1)
        self.conv1 = Conv2D(
            in_channels=channel,
            out_channels=squeezed,
            kernel_size=1,
            stride=1,
            padding=0)
        self.relu = nn.ReLU()
        self.conv2 = Conv2D(
            in_channels=squeezed,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0)
        self.hardsigmoid = nn.Hardsigmoid()

    def forward(self, x):
        """Multiply ``x`` by the gate computed from its pooled features."""
        gate = self.avg_pool(x)
        gate = self.relu(self.conv1(gate))
        gate = self.hardsigmoid(self.conv2(gate))
        return paddle.multiply(x=x, y=gate)
|
||||
|
||||
|
||||
@register
@serializable
class LCNet(nn.Layer):
    """LCNet backbone assembled from the stage tables in ``NET_CONFIG``.

    Args:
        scale: Width multiplier; every channel count is passed through
            ``make_divisible(channels * scale)``.
        feature_maps: Selects which stage outputs are returned. The outputs
            of ``blocks3`` .. ``blocks6`` sit at positions 0 .. 3, and
            position ``i`` is kept when ``i + 2`` is in ``feature_maps``.
        act: Activation name forwarded to every conv block.
    """

    def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'):
        super().__init__()
        self.scale = scale
        self.feature_maps = feature_maps

        def build_stage(stage_key):
            # One DepthwiseSeparable per row of the stage table, in order.
            return nn.Sequential(* [
                DepthwiseSeparable(
                    num_channels=make_divisible(in_c * scale),
                    num_filters=make_divisible(out_c * scale),
                    dw_size=k,
                    stride=s,
                    use_se=se,
                    act=act)
                for (k, in_c, out_c, s, se) in NET_CONFIG[stage_key]
            ])

        def stage_width(stage_key):
            # Output channels of a stage = out_c of its last row, scaled.
            return make_divisible(NET_CONFIG[stage_key][-1][2] * scale)

        self.conv1 = ConvBNLayer(
            num_channels=3,
            filter_size=3,
            num_filters=make_divisible(16 * scale),
            stride=2,
            act=act)

        self.blocks2 = build_stage("blocks2")
        self.blocks3 = build_stage("blocks3")
        self.blocks4 = build_stage("blocks4")
        self.blocks5 = build_stage("blocks5")
        self.blocks6 = build_stage("blocks6")

        stage_channels = [
            stage_width(key)
            for key in ("blocks3", "blocks4", "blocks5", "blocks6")
        ]
        self._out_channels = [
            ch for idx, ch in enumerate(stage_channels)
            if idx + 2 in feature_maps
        ]

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: The selected stage outputs, in stage order.
        """
        x = self.conv1(inputs['image'])
        x = self.blocks2(x)

        stage_outs = []
        for stage in (self.blocks3, self.blocks4, self.blocks5, self.blocks6):
            x = stage(x)
            stage_outs.append(x)

        return [
            out for i, out in enumerate(stage_outs)
            if i + 2 in self.feature_maps
        ]

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per returned feature map (channels only)."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
402
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
Normal file
402
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v1.py
Normal file
@@ -0,0 +1,402 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['MobileNet']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D followed by batch norm and an optional ReLU / ReLU6.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Kernel size of the convolution.
        stride: Stride of the convolution.
        padding: Padding of the convolution.
        num_groups: Number of convolution groups.
        act: ``"relu"`` or ``"relu6"``; any other value applies no
            activation.
        conv_lr: Learning-rate factor of the convolution weight.
        conv_decay: L2 decay coefficient of the convolution weight.
        norm_decay: L2 decay coefficient of the norm weight and bias.
        norm_type: The batch norm is only created for ``'bn'`` or
            ``'sync_bn'``.
        name: Not used inside this class.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 num_groups=1,
                 act='relu',
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ConvBNLayer, self).__init__()
        self.act = act

        conv_weight_attr = ParamAttr(
            learning_rate=conv_lr,
            initializer=KaimingNormal(),
            regularizer=L2Decay(conv_decay))
        self._conv = nn.Conv2D(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=conv_weight_attr,
            bias_attr=False)

        param_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        bias_attr = ParamAttr(regularizer=L2Decay(norm_decay))
        if norm_type in ('bn', 'sync_bn'):
            self._batch_norm = nn.BatchNorm2D(
                out_channels, weight_attr=param_attr, bias_attr=bias_attr)

    def forward(self, x):
        """Apply conv, batch norm and the configured activation."""
        out = self._batch_norm(self._conv(x))
        if self.act == "relu":
            return F.relu(out)
        if self.act == "relu6":
            return F.relu6(out)
        return out
|
||||
|
||||
|
||||
class DepthwiseSeparable(nn.Layer):
    """3x3 grouped conv followed by a 1x1 pointwise conv.

    Args:
        in_channels: Number of input channels.
        out_channels1: Unscaled output channels of the 3x3 conv.
        out_channels2: Unscaled output channels of the 1x1 conv.
        num_groups: Unscaled group count of the 3x3 conv.
        stride: Stride of the 3x3 conv.
        scale: Multiplier applied (then truncated with ``int``) to
            ``out_channels1``, ``out_channels2`` and ``num_groups``.
        conv_lr: Learning-rate factor forwarded to both conv layers.
        conv_decay: L2 decay coefficient of the conv weights.
        norm_decay: L2 decay coefficient of the norm parameters.
        norm_type: Norm type forwarded to both conv layers.
        name: Optional prefix for the sub-layer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups,
                 stride,
                 scale,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(DepthwiseSeparable, self).__init__()

        # The default ``name=None`` used to make ``name + "_dw"`` raise
        # TypeError; fall back to an empty prefix instead.
        prefix = "" if name is None else name
        mid_channels = int(out_channels1 * scale)

        self._depthwise_conv = ConvBNLayer(
            in_channels,
            mid_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=int(num_groups * scale),
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_dw")

        self._pointwise_conv = ConvBNLayer(
            mid_channels,
            int(out_channels2 * scale),
            kernel_size=1,
            stride=1,
            padding=0,
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_sep")

    def forward(self, x):
        """Apply the 3x3 grouped conv and then the 1x1 conv."""
        x = self._depthwise_conv(x)
        x = self._pointwise_conv(x)
        return x
|
||||
|
||||
|
||||
class ExtraBlock(nn.Layer):
    """1x1 conv followed by a 3x3 conv, both with ReLU6.

    Args:
        in_channels: Number of input channels.
        out_channels1: Output channels of the 1x1 conv.
        out_channels2: Output channels of the 3x3 conv.
        num_groups: Group count used by both convs.
        stride: Stride of the 3x3 conv.
        conv_lr: Learning-rate factor forwarded to both conv layers.
        conv_decay: L2 decay coefficient of the conv weights.
        norm_decay: L2 decay coefficient of the norm parameters.
        norm_type: Norm type forwarded to both conv layers.
        name: Optional prefix for the sub-layer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_channels,
                 out_channels1,
                 out_channels2,
                 num_groups=1,
                 stride=2,
                 conv_lr=1.,
                 conv_decay=0.,
                 norm_decay=0.,
                 norm_type='bn',
                 name=None):
        super(ExtraBlock, self).__init__()

        # The default ``name=None`` used to make ``name + "_extra1"`` raise
        # TypeError; fall back to an empty prefix instead.
        prefix = "" if name is None else name
        mid_channels = int(out_channels1)
        groups = int(num_groups)

        self.pointwise_conv = ConvBNLayer(
            in_channels,
            mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            num_groups=groups,
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra1")

        self.normal_conv = ConvBNLayer(
            mid_channels,
            int(out_channels2),
            kernel_size=3,
            stride=stride,
            padding=1,
            num_groups=groups,
            act='relu6',
            conv_lr=conv_lr,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name=prefix + "_extra2")

    def forward(self, x):
        """Apply the 1x1 conv and then the 3x3 conv."""
        x = self.pointwise_conv(x)
        x = self.normal_conv(x)
        return x
|
||||
|
||||
|
||||
@register
@serializable
class MobileNet(nn.Layer):
    """MobileNet backbone built from 13 ``DepthwiseSeparable`` blocks.

    Args:
        norm_type: Norm type forwarded to every conv layer.
        norm_decay: L2 decay coefficient of the norm parameters.
        conv_decay: L2 decay coefficient of the conv weights.
        scale: Width multiplier applied to the channel counts of the stem
            and the depthwise-separable blocks.
        conv_learning_rate: Learning-rate factor of the conv weights.
        feature_maps: 1-based indices of the blocks whose outputs are
            returned (extra blocks continue the numbering after block 13).
            A single integer is wrapped into a list.
        with_extra_blocks: Whether to append ``ExtraBlock`` layers.
        extra_block_filters: ``[out_channels1, out_channels2]`` per extra
            block.
    """
    __shared__ = ['norm_type']

    def __init__(self,
                 norm_type='bn',
                 norm_decay=0.,
                 conv_decay=0.,
                 scale=1,
                 conv_learning_rate=1.0,
                 feature_maps=[4, 6, 13],
                 with_extra_blocks=False,
                 extra_block_filters=[[256, 512], [128, 256], [128, 256],
                                      [64, 128]]):
        super(MobileNet, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        self._out_channels = []

        self.conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=int(32 * scale),
            kernel_size=3,
            stride=2,
            padding=1,
            conv_lr=conv_learning_rate,
            conv_decay=conv_decay,
            norm_decay=norm_decay,
            norm_type=norm_type,
            name="conv1")

        # (sublayer name, out_channels1, out_channels2, stride). For every
        # block the input width is int(out_channels1 * scale) and the group
        # count equals out_channels1.
        dws_specs = [
            ("conv2_1", 32, 64, 1),
            ("conv2_2", 64, 128, 2),
            # 1/4
            ("conv3_1", 128, 128, 1),
            ("conv3_2", 128, 256, 2),
            # 1/8
            ("conv4_1", 256, 256, 1),
            ("conv4_2", 256, 512, 2),
            # 1/16
        ]
        dws_specs += [("conv5_" + str(i + 1), 512, 512, 1) for i in range(5)]
        dws_specs += [
            ("conv5_6", 512, 1024, 2),
            # 1/32
            ("conv6", 1024, 1024, 1),
        ]

        self.dwsl = []
        for layer_name, ch1, ch2, stride in dws_specs:
            block = self.add_sublayer(
                layer_name,
                sublayer=DepthwiseSeparable(
                    in_channels=int(ch1 * scale),
                    out_channels1=ch1,
                    out_channels2=ch2,
                    num_groups=ch1,
                    stride=stride,
                    scale=scale,
                    conv_lr=conv_learning_rate,
                    conv_decay=conv_decay,
                    norm_decay=norm_decay,
                    norm_type=norm_type,
                    name=layer_name))
            self.dwsl.append(block)
            self._update_out_channels(
                int(ch2 * scale), len(self.dwsl), feature_maps)

        if self.with_extra_blocks:
            self.extra_blocks = []
            # The first extra block consumes the output of "conv6", whose
            # width is int(1024 * scale). Using a fixed 1024 here only
            # matched when scale == 1.
            in_c = int(1024 * scale)
            for i, block_filter in enumerate(self.extra_block_filters):
                layer_name = "conv7_" + str(i + 1)
                conv_extra = self.add_sublayer(
                    layer_name,
                    sublayer=ExtraBlock(
                        in_c,
                        block_filter[0],
                        block_filter[1],
                        conv_lr=conv_learning_rate,
                        conv_decay=conv_decay,
                        norm_decay=norm_decay,
                        norm_type=norm_type,
                        name=layer_name))
                self.extra_blocks.append(conv_extra)
                self._update_out_channels(
                    block_filter[1],
                    len(self.dwsl) + len(self.extra_blocks), feature_maps)
                # Each extra block feeds the next one.
                in_c = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested output."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: Outputs of the blocks selected by ``feature_maps``, in
            execution order.
        """
        outs = []
        y = self.conv1(inputs['image'])
        for i, block in enumerate(self.dwsl):
            y = block(y)
            if i + 1 in self.feature_maps:
                outs.append(y)

        if not self.with_extra_blocks:
            return outs

        # ``y`` already holds the output of the last depthwise block. The
        # previous ``y = outs[-1]`` fed an earlier feature map (or raised
        # IndexError) whenever that block was not listed in feature_maps.
        first_extra_idx = len(self.dwsl) + 1
        for idx, block in enumerate(self.extra_blocks, first_extra_idx):
            y = block(y)
            if idx in self.feature_maps:
                outs.append(y)
        return outs

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per returned feature map (channels only)."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
478
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
Normal file
478
rtdetr_paddle/ppdet/modeling/backbones/mobilenet_v3.py
Normal file
@@ -0,0 +1,478 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['MobileNetV3']
|
||||
|
||||
|
||||
def make_divisible(v, divisor=8, min_value=None):
    """Round ``v`` to a nearby multiple of ``divisor``.

    Args:
        v: Value to round (typically a scaled channel count).
        divisor: The result is a multiple of this unless ``min_value``
            wins the ``max`` below.
        min_value: Lower bound of the result; defaults to ``divisor``.

    Returns:
        The rounded value, bumped up by one ``divisor`` when rounding would
        otherwise drop it below 90% of ``v``.
    """
    lower_bound = divisor if min_value is None else min_value
    # Round half up to the nearest multiple of ``divisor``.
    nearest = (int(v + divisor / 2) // divisor) * divisor
    rounded = max(lower_bound, nearest)
    # Do not let rounding shrink the value by more than 10%.
    return rounded + divisor if rounded < 0.9 * v else rounded
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Conv2D followed by batch norm and an optional activation.

    Args:
        in_c: Number of input channels.
        out_c: Number of output channels.
        filter_size: Kernel size of the convolution.
        stride: Stride of the convolution.
        padding: Padding of the convolution.
        num_groups: Number of convolution groups.
        act: ``None``, ``"relu"``, ``"relu6"`` or ``"hard_swish"``.
        lr_mult: Learning-rate factor of the conv weight (and of the norm
            parameters unless ``freeze_norm`` is set).
        conv_decay: L2 decay coefficient of the conv weight.
        norm_type: The batch norm is only created for ``'bn'`` or
            ``'sync_bn'``.
        norm_decay: L2 decay coefficient of the norm parameters.
        freeze_norm: When true, the norm parameters get learning rate 0,
            are marked non-trainable, and global statistics are used.
        name: Not used inside this class.
    """

    def __init__(self,
                 in_c,
                 out_c,
                 filter_size,
                 stride,
                 padding,
                 num_groups=1,
                 act=None,
                 lr_mult=1.,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=""):
        super(ConvBNLayer, self).__init__()
        self.act = act

        conv_weight_attr = ParamAttr(
            learning_rate=lr_mult, regularizer=L2Decay(conv_decay))
        self.conv = nn.Conv2D(
            in_channels=in_c,
            out_channels=out_c,
            kernel_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            weight_attr=conv_weight_attr,
            bias_attr=False)

        trainable = not freeze_norm
        norm_lr = lr_mult if trainable else 0.
        param_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        bias_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        global_stats = True if freeze_norm else None
        if norm_type in ('bn', 'sync_bn'):
            self.bn = nn.BatchNorm2D(
                out_c,
                weight_attr=param_attr,
                bias_attr=bias_attr,
                use_global_stats=global_stats)
        norm_params = self.bn.parameters()
        if freeze_norm:
            for param in norm_params:
                param.stop_gradient = True

    def forward(self, x):
        """Apply conv, batch norm and the configured activation.

        Raises:
            NotImplementedError: If ``act`` is set to an unknown name.
        """
        x = self.bn(self.conv(x))
        if self.act is None:
            return x
        if self.act == "relu":
            return F.relu(x)
        if self.act == "relu6":
            return F.relu6(x)
        if self.act == "hard_swish":
            return F.hardswish(x)
        raise NotImplementedError(
            "The activation function is selected incorrectly.")
|
||||
|
||||
|
||||
class ResidualUnit(nn.Layer):
    """1x1 expand conv, depthwise conv, optional SE, 1x1 linear conv.

    A shortcut connection is added when ``stride == 1`` and
    ``in_c == out_c``.

    Args:
        in_c: Number of input channels.
        mid_c: Number of channels after the expand conv.
        out_c: Number of output channels.
        filter_size: Kernel size of the depthwise conv.
        stride: Stride of the depthwise conv.
        use_se: Whether to apply an ``SEModule`` after the depthwise conv.
        lr_mult: Learning-rate factor forwarded to all sub-layers.
        conv_decay: L2 decay coefficient of the conv weights.
        norm_type: Norm type forwarded to the conv layers.
        norm_decay: L2 decay coefficient of the norm parameters.
        freeze_norm: Forwarded to the conv layers.
        act: Activation of the expand and depthwise convs; the linear conv
            has no activation.
        return_list: When true, ``forward`` returns
            ``[expand_output, block_output]`` instead of the block output.
        name: Prefix for the sub-layer names.
    """

    def __init__(self,
                 in_c,
                 mid_c,
                 out_c,
                 filter_size,
                 stride,
                 use_se,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 act=None,
                 return_list=False,
                 name=''):
        super(ResidualUnit, self).__init__()
        self.if_shortcut = stride == 1 and in_c == out_c
        self.use_se = use_se
        self.return_list = return_list

        # Settings shared by the three conv layers.
        shared = dict(
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm)

        self.expand_conv = ConvBNLayer(
            in_c=in_c,
            out_c=mid_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=act,
            name=name + "_expand",
            **shared)
        self.bottleneck_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=mid_c,
            filter_size=filter_size,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            num_groups=mid_c,
            act=act,
            name=name + "_depthwise",
            **shared)
        if self.use_se:
            self.mid_se = SEModule(
                mid_c, lr_mult, conv_decay, name=name + "_se")
        self.linear_conv = ConvBNLayer(
            in_c=mid_c,
            out_c=out_c,
            filter_size=1,
            stride=1,
            padding=0,
            act=None,
            name=name + "_linear",
            **shared)

    def forward(self, inputs):
        """Run the unit; see ``return_list`` for the return form."""
        expanded = self.expand_conv(inputs)
        out = self.bottleneck_conv(expanded)
        if self.use_se:
            out = self.mid_se(out)
        out = self.linear_conv(out)
        if self.if_shortcut:
            out = paddle.add(inputs, out)
        if self.return_list:
            return [expanded, out]
        return out
|
||||
|
||||
|
||||
class SEModule(nn.Layer):
    """Squeeze-and-excitation style channel gate.

    Args:
        channel: Number of input (and output) channels.
        lr_mult: Learning-rate factor of both conv layers' parameters.
        conv_decay: L2 decay coefficient of both conv layers' parameters.
        reduction: Divisor for the hidden channel count.
        name: Not used inside this class.
    """

    def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""):
        super(SEModule, self).__init__()

        def make_attr():
            # A fresh attribute object for every parameter.
            return ParamAttr(
                learning_rate=lr_mult, regularizer=L2Decay(conv_decay))

        self.avg_pool = nn.AdaptiveAvgPool2D(1)
        mid_channels = int(channel // reduction)
        self.conv1 = nn.Conv2D(
            in_channels=channel,
            out_channels=mid_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=make_attr(),
            bias_attr=make_attr())
        self.conv2 = nn.Conv2D(
            in_channels=mid_channels,
            out_channels=channel,
            kernel_size=1,
            stride=1,
            padding=0,
            weight_attr=make_attr(),
            bias_attr=make_attr())

    def forward(self, inputs):
        """Return ``inputs`` multiplied by the gate computed from them."""
        gate = self.avg_pool(inputs)
        gate = F.relu(self.conv1(gate))
        gate = F.hardsigmoid(self.conv2(gate), slope=0.2, offset=0.5)
        return paddle.multiply(x=inputs, y=gate)
|
||||
|
||||
|
||||
class ExtraBlockDW(nn.Layer):
    """1x1 conv, 3x3 grouped conv and 1x1 conv, all with ReLU6.

    Args:
        in_c: Number of input channels.
        ch_1: Output channels of the first 1x1 conv; also the group count
            of the 3x3 conv.
        ch_2: Output channels of the 3x3 conv and of the final 1x1 conv.
        stride: Stride of the 3x3 conv.
        lr_mult: Learning-rate factor forwarded to all conv layers.
        conv_decay: L2 decay coefficient of the conv weights.
        norm_type: Norm type forwarded to the conv layers.
        norm_decay: L2 decay coefficient of the norm parameters.
        freeze_norm: Forwarded to the conv layers.
        name: Optional prefix for the sub-layer names; ``None`` is treated
            as an empty prefix.
    """

    def __init__(self,
                 in_c,
                 ch_1,
                 ch_2,
                 stride,
                 lr_mult,
                 conv_decay=0.,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=False,
                 name=None):
        super(ExtraBlockDW, self).__init__()

        # The default ``name=None`` used to make ``name + "_extra1"`` raise
        # TypeError; fall back to an empty prefix instead.
        prefix = "" if name is None else name

        self.pointwise_conv = ConvBNLayer(
            in_c=in_c,
            out_c=ch_1,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra1")
        self.depthwise_conv = ConvBNLayer(
            in_c=ch_1,
            out_c=ch_2,
            filter_size=3,
            stride=stride,
            padding='SAME',
            num_groups=int(ch_1),
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_dw")
        self.normal_conv = ConvBNLayer(
            in_c=ch_2,
            out_c=ch_2,
            filter_size=1,
            stride=1,
            padding='SAME',
            act='relu6',
            lr_mult=lr_mult,
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            name=prefix + "_extra2_sep")

    def forward(self, inputs):
        """Apply the three conv layers in sequence."""
        x = self.pointwise_conv(inputs)
        x = self.depthwise_conv(x)
        x = self.normal_conv(x)
        return x
|
||||
|
||||
|
||||
@register
@serializable
class MobileNetV3(nn.Layer):
    """MobileNetV3 backbone ("large" or "small" layer table).

    Layers are numbered from 2: the first ``ResidualUnit`` is ``conv2``,
    and ``feature_maps`` refers to these numbers. Extra blocks, when
    enabled, continue the numbering after the last ``ResidualUnit``.

    Args:
        scale: Width multiplier; channel counts go through
            ``make_divisible(scale * channels)``.
        model_name: ``"large"`` or ``"small"``.
        feature_maps: Layer numbers whose outputs are returned. A single
            integer is wrapped into a list.
        with_extra_blocks: Whether to append the extra conv layer and the
            ``ExtraBlockDW`` layers.
        extra_block_filters: ``[ch_1, ch_2]`` per ``ExtraBlockDW``.
        lr_mult_list: Learning-rate factors; layer ``i`` (0-based) uses
            entry ``min(i // 3, len(lr_mult_list) - 1)``.
        conv_decay: L2 decay coefficient of the conv weights.
        multiplier: Extra factor applied to some channel counts of the last
            three table rows when it differs from 1.0.
        norm_type: Norm type forwarded to the conv layers.
        norm_decay: L2 decay coefficient of the norm parameters.
        freeze_norm: Forwarded to the conv layers.

    Raises:
        ValueError: If ``norm_type`` is ``'sync_bn'`` and ``freeze_norm``
            is set.
        NotImplementedError: If ``model_name`` is unknown.
    """
    __shared__ = ['norm_type']

    def __init__(
            self,
            scale=1.0,
            model_name="large",
            feature_maps=[6, 12, 15],
            with_extra_blocks=False,
            extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]],
            lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0],
            conv_decay=0.0,
            multiplier=1.0,
            norm_type='bn',
            norm_decay=0.0,
            freeze_norm=False):
        super(MobileNetV3, self).__init__()
        if isinstance(feature_maps, Integral):
            feature_maps = [feature_maps]
        if norm_type == 'sync_bn' and freeze_norm:
            raise ValueError(
                "The norm_type should not be sync_bn when freeze_norm is True")
        self.feature_maps = feature_maps
        self.with_extra_blocks = with_extra_blocks
        self.extra_block_filters = extra_block_filters

        inplanes = 16
        if model_name == "large":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, False, "relu", 1],
                [3, 64, 24, False, "relu", 2],
                [3, 72, 24, False, "relu", 1],
                [5, 72, 40, True, "relu", 2],  # RCNN output
                [5, 120, 40, True, "relu", 1],
                [5, 120, 40, True, "relu", 1],  # YOLOv3 output
                [3, 240, 80, False, "hard_swish", 2],  # RCNN output
                [3, 200, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 184, 80, False, "hard_swish", 1],
                [3, 480, 112, True, "hard_swish", 1],
                [3, 672, 112, True, "hard_swish", 1],  # YOLOv3 output
                [5, 672, 160, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 960, 160, True, "hard_swish", 1],
                [5, 960, 160, True, "hard_swish", 1],  # YOLOv3 output
            ]
        elif model_name == "small":
            self.cfg = [
                # k, exp, c, se, nl, s,
                [3, 16, 16, True, "relu", 2],
                [3, 72, 24, False, "relu", 2],  # RCNN output
                [3, 88, 24, False, "relu", 1],  # YOLOv3 output
                [5, 96, 40, True, "hard_swish", 2],  # RCNN output
                [5, 240, 40, True, "hard_swish", 1],
                [5, 240, 40, True, "hard_swish", 1],
                [5, 120, 48, True, "hard_swish", 1],
                [5, 144, 48, True, "hard_swish", 1],  # YOLOv3 output
                [5, 288, 96, True, "hard_swish", 2],  # SSD/SSDLite/RCNN output
                [5, 576, 96, True, "hard_swish", 1],
                [5, 576, 96, True, "hard_swish", 1],  # YOLOv3 output
            ]
        else:
            raise NotImplementedError(
                "mode[{}_model] is not implemented!".format(model_name))

        if multiplier != 1.0:
            # (row, column) cells of the table that get rescaled.
            for row, col in ((-3, 2), (-2, 1), (-2, 2), (-1, 1), (-1, 2)):
                self.cfg[row][col] = int(self.cfg[row][col] * multiplier)

        # Settings shared by every conv layer created below.
        norm_kwargs = dict(
            conv_decay=conv_decay,
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm)
        last_lr_idx = len(lr_mult_list) - 1

        self.conv1 = ConvBNLayer(
            in_c=3,
            out_c=make_divisible(inplanes * scale),
            filter_size=3,
            stride=2,
            padding=1,
            num_groups=1,
            act="hard_swish",
            lr_mult=lr_mult_list[0],
            name="conv1",
            **norm_kwargs)

        self._out_channels = []
        self.block_list = []
        inplanes = make_divisible(inplanes * scale)
        for block_idx, (k, exp, c, se, nl, s) in enumerate(self.cfg):
            layer_no = block_idx + 2
            layer_name = "conv" + str(layer_no)
            lr_mult = lr_mult_list[min(block_idx // 3, last_lr_idx)]

            # for SSD/SSDLite, first head input is after ResidualUnit expand_conv
            return_list = self.with_extra_blocks and layer_no in self.feature_maps

            mid_c = make_divisible(scale * exp)
            out_c = make_divisible(scale * c)
            block = self.add_sublayer(
                layer_name,
                sublayer=ResidualUnit(
                    in_c=inplanes,
                    mid_c=mid_c,
                    out_c=out_c,
                    filter_size=k,
                    stride=s,
                    use_se=se,
                    act=nl,
                    lr_mult=lr_mult,
                    return_list=return_list,
                    name=layer_name,
                    **norm_kwargs))
            self.block_list.append(block)
            inplanes = out_c
            self._update_out_channels(mid_c if return_list else out_c,
                                      layer_no, feature_maps)

        if self.with_extra_blocks:
            self.extra_block_list = []
            layer_no = len(self.cfg) + 2
            extra_out_c = make_divisible(scale * self.cfg[-1][1])
            # All extra layers share the factor picked for this position.
            lr_mult = lr_mult_list[min(len(self.cfg) // 3, last_lr_idx)]

            layer_name = "conv" + str(layer_no)
            conv_extra = self.add_sublayer(
                layer_name,
                sublayer=ConvBNLayer(
                    in_c=inplanes,
                    out_c=extra_out_c,
                    filter_size=1,
                    stride=1,
                    padding=0,
                    num_groups=1,
                    act="hard_swish",
                    lr_mult=lr_mult,
                    name=layer_name,
                    **norm_kwargs))
            self.extra_block_list.append(conv_extra)
            self._update_out_channels(extra_out_c, layer_no, feature_maps)
            layer_no += 1

            in_c = extra_out_c
            for block_filter in self.extra_block_filters:
                layer_name = 'conv' + str(layer_no)
                conv_extra = self.add_sublayer(
                    layer_name,
                    sublayer=ExtraBlockDW(
                        in_c,
                        block_filter[0],
                        block_filter[1],
                        stride=2,
                        lr_mult=lr_mult,
                        name=layer_name,
                        **norm_kwargs))
                self.extra_block_list.append(conv_extra)
                self._update_out_channels(block_filter[1], layer_no,
                                          feature_maps)
                layer_no += 1
                # Each extra block feeds the next one.
                in_c = block_filter[1]

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested output."""
        if feature_idx in feature_maps:
            self._out_channels.append(channel)

    def forward(self, inputs):
        """Run the backbone on ``inputs['image']``.

        Returns:
            list: Outputs of the layers selected by ``feature_maps``, in
            execution order.
        """
        x = self.conv1(inputs['image'])
        outs = []
        for layer_no, block in enumerate(self.block_list, 2):
            x = block(x)
            if layer_no in self.feature_maps:
                if isinstance(x, list):
                    # [expand_conv output, block output]: expose the first,
                    # keep propagating the second.
                    outs.append(x[0])
                    x = x[1]
                else:
                    outs.append(x)

        if not self.with_extra_blocks:
            return outs

        first_extra_no = len(self.block_list) + 2
        for layer_no, block in enumerate(self.extra_block_list,
                                         first_extra_no):
            x = block(x)
            if layer_no in self.feature_maps:
                outs.append(x)
        return outs

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per returned feature map (channels only)."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
266
rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
Normal file
266
rtdetr_paddle/ppdet/modeling/backbones/mobileone.py
Normal file
@@ -0,0 +1,266 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf.
|
||||
Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py
|
||||
The copyright of microsoft/Swin-Transformer is as follows:
|
||||
MIT License [see LICENSE for details]
|
||||
"""
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Normal, Constant
|
||||
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ppdet.modeling.layers import ConvNormLayer
|
||||
|
||||
|
||||
class MobileOneBlock(nn.Layer):
    """Re-parameterisable MobileOne block (depthwise stage + pointwise stage).

    While training, stage 1 sums ``conv_num`` depthwise ``kernel_size``
    conv+norm branches, one depthwise 1x1 conv+norm branch and an optional
    BatchNorm identity branch; stage 2 sums ``conv_num`` pointwise conv+norm
    branches and an optional BatchNorm identity branch. Both identity
    branches exist only when ``ch_in == ch_out`` and ``stride == 1``. The
    activation is applied after each stage. ``convert_to_deploy`` folds all
    branches into two plain convolutions, ``conv1`` and ``conv2``.

    Args:
        ch_in (int): number of input channels.
        ch_out (int): number of output channels.
        stride (int): stride of the depthwise stage.
        kernel_size (int): kernel size of the depthwise stage.
        conv_num (int): number of parallel conv+norm branches per stage.
        norm_type, norm_decay, norm_groups, bias_on, lr_scale, freeze_norm,
        initializer, skip_quant: forwarded unchanged to every
            ``ConvNormLayer`` branch.
        act: activation. ``None``, ``str`` and ``dict`` values are resolved
            through ``get_act_fn``; any other value is used as given.
    """

    def __init__(
            self,
            ch_in,
            ch_out,
            stride,
            kernel_size,
            conv_num=1,
            norm_type='bn',
            norm_decay=0.,
            norm_groups=32,
            bias_on=False,
            lr_scale=1.,
            freeze_norm=False,
            initializer=Normal(
                mean=0., std=0.01),
            skip_quant=False,
            act='relu', ):
        super(MobileOneBlock, self).__init__()

        self.ch_in = ch_in
        self.ch_out = ch_out
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = (kernel_size - 1) // 2
        self.k = conv_num

        # Settings shared by every conv+norm branch of the block.
        branch_cfg = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            norm_groups=norm_groups,
            bias_on=bias_on,
            lr_scale=lr_scale,
            freeze_norm=freeze_norm,
            initializer=initializer,
            skip_quant=skip_quant)

        self.depth_conv = nn.LayerList()
        self.point_conv = nn.LayerList()
        for _ in range(self.k):
            self.depth_conv.append(
                ConvNormLayer(
                    ch_in,
                    ch_in,
                    kernel_size,
                    stride=stride,
                    groups=ch_in,
                    **branch_cfg))
            self.point_conv.append(
                ConvNormLayer(
                    ch_in, ch_out, 1, stride=1, groups=1, **branch_cfg))
        self.rbr_1x1 = ConvNormLayer(
            ch_in, ch_in, 1, stride=self.stride, groups=ch_in, **branch_cfg)

        # Identity (BatchNorm-only) branches need matching shapes on both
        # sides of the stage.
        has_identity = ch_in == ch_out and self.stride == 1
        self.rbr_identity_st1 = self._make_identity_norm(
            ch_in) if has_identity else None
        self.rbr_identity_st2 = self._make_identity_norm(
            ch_out) if has_identity else None
        self.act = get_act_fn(act) if act is None or isinstance(act, (
            str, dict)) else act

    @staticmethod
    def _make_identity_norm(num_features):
        """Build the BatchNorm used as an identity branch (no weight decay)."""
        return nn.BatchNorm2D(
            num_features=num_features,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

    def forward(self, x):
        """Apply the block; uses the fused convs once they exist."""
        if hasattr(self, "conv1") and hasattr(self, "conv2"):
            return self.act(self.conv2(self.act(self.conv1(x))))

        # Stage 1: depthwise branches + 1x1 depthwise branch + identity.
        if self.rbr_identity_st1 is None:
            id_out_st1 = 0
        else:
            id_out_st1 = self.rbr_identity_st1(x)

        x1_1 = 0
        for branch in self.depth_conv:
            x1_1 += branch(x)

        x1_2 = self.rbr_1x1(x)
        x1 = self.act(x1_1 + x1_2 + id_out_st1)

        # Stage 2: pointwise branches + identity.
        if self.rbr_identity_st2 is None:
            id_out_st2 = 0
        else:
            id_out_st2 = self.rbr_identity_st2(x1)

        x2_1 = 0
        for branch in self.point_conv:
            x2_1 += branch(x1)
        return self.act(x2_1 + id_out_st2)

    def convert_to_deploy(self):
        """Fuse all training branches into ``conv1``/``conv2`` and drop them.

        Calling this again after the branches were removed is a no-op.
        """
        if not hasattr(self, 'depth_conv'):
            # Already converted: the training branches no longer exist, so
            # there is nothing left to fuse.
            return

        if not hasattr(self, 'conv1'):
            self.conv1 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_in,
                kernel_size=self.kernel_size,
                stride=self.stride,
                padding=self.padding,
                groups=self.ch_in,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))
        if not hasattr(self, 'conv2'):
            self.conv2 = nn.Conv2D(
                in_channels=self.ch_in,
                out_channels=self.ch_out,
                kernel_size=1,
                stride=1,
                padding='SAME',
                groups=1,
                bias_attr=ParamAttr(
                    initializer=Constant(value=0.), learning_rate=1.))

        (conv1_kernel, conv1_bias, conv2_kernel,
         conv2_bias) = self.get_equivalent_kernel_bias()
        self.conv1.weight.set_value(conv1_kernel)
        self.conv1.bias.set_value(conv1_bias)
        self.conv2.weight.set_value(conv2_kernel)
        self.conv2.bias.set_value(conv2_bias)

        self.__delattr__('depth_conv')
        self.__delattr__('point_conv')
        self.__delattr__('rbr_1x1')
        if hasattr(self, 'rbr_identity_st1'):
            self.__delattr__('rbr_identity_st1')
        if hasattr(self, 'rbr_identity_st2'):
            self.__delattr__('rbr_identity_st2')

    def get_equivalent_kernel_bias(self):
        """Return ``(conv1_kernel, conv1_bias, conv2_kernel, conv2_bias)``.

        Each value is the sum of the norm-folded kernels / biases of all
        branches of the corresponding stage.
        """
        # NOTE(review): folding reads only ``conv.weight`` of each branch; a
        # conv bias (``bias_on=True``) would not be folded in -- confirm.
        st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv)
        st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1)
        st1_kernelid, st1_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st1,
            kernel_size=self.kernel_size,
            depthwise=True)

        st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv)
        st2_kernelid, st2_biasid = self._fuse_bn_tensor(
            self.rbr_identity_st2, kernel_size=1, depthwise=False)

        conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor(
            st1_kernel1x1) + st1_kernelid
        conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid

        conv2_kernel = st2_kernel1x1 + st2_kernelid
        conv2_bias = st2_bias1x1 + st2_biasid

        return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to ``self.kernel_size``; ``None`` gives 0."""
        if kernel1x1 is None:
            return 0
        padding_size = (self.kernel_size - 1) // 2
        return nn.functional.pad(
            kernel1x1,
            [padding_size, padding_size, padding_size, padding_size])

    @staticmethod
    def _fold_norm(kernel, norm):
        """Fold the statistics and affine terms of ``norm`` into ``kernel``.

        Returns:
            tuple: ``(kernel * gamma / std, beta - mean * gamma / std)``
            with ``std = sqrt(variance + epsilon)``.
        """
        std = (norm._variance + norm._epsilon).sqrt()
        scale = (norm.weight / std).reshape((-1, 1, 1, 1))
        return kernel * scale, norm.bias - norm._mean * norm.weight / std

    def _fuse_bn_tensor(self, branch, kernel_size=3, depthwise=None):
        """Return the norm-folded ``(kernel, bias)`` of one branch.

        Args:
            branch: ``None`` (gives ``(0, 0)``), an ``nn.LayerList`` of
                ``ConvNormLayer`` (results are summed), a single
                ``ConvNormLayer``, or an ``nn.BatchNorm2D`` identity branch.
            kernel_size (int): spatial size of the identity kernel built for
                a BatchNorm identity branch; unused otherwise.
            depthwise (bool|None): layout of that identity kernel, one input
                channel per filter when true, ``ch_in`` when false. ``None``
                keeps the previous rule (depthwise unless ``kernel_size``
                is 1).

        Raises:
            ValueError: if ``kernel_size`` is smaller than 1.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.LayerList):
            folded = [
                self._fold_norm(item.conv.weight, item.norm)
                for item in branch
            ]
            return (sum(kernel for kernel, _ in folded),
                    sum(bias for _, bias in folded))

        if isinstance(branch, ConvNormLayer):
            return self._fold_norm(branch.conv.weight, branch.norm)

        assert isinstance(branch, nn.BatchNorm2D)
        if kernel_size < 1:
            raise ValueError("Invalid kernel size recieved!")
        if depthwise is None:
            depthwise = kernel_size != 1
        input_dim = 1 if depthwise else self.ch_in
        center = (kernel_size - 1) // 2
        # Identity expressed as a convolution: a single 1 at the kernel
        # centre connecting every channel to itself.
        kernel_value = paddle.zeros(
            shape=[self.ch_in, input_dim, kernel_size, kernel_size],
            dtype='float32')
        for i in range(self.ch_in):
            kernel_value[i, i % input_dim, center, center] = 1
        kernel = paddle.to_tensor(kernel_value, place=branch.weight.place)
        return self._fold_norm(kernel, branch)
|
||||
69
rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
Normal file
69
rtdetr_paddle/ppdet/modeling/backbones/name_adapter.py
Normal file
@@ -0,0 +1,69 @@
|
||||
class NameAdapter(object):
    """Fix the backbones variable names for pretrained weight"""

    def __init__(self, model):
        super(NameAdapter, self).__init__()
        self.model = model

    @property
    def model_type(self):
        """``_model_type`` of the wrapped model, or ``''`` when missing."""
        return getattr(self.model, '_model_type', '')

    @property
    def variant(self):
        """``variant`` of the wrapped model, or ``''`` when missing."""
        return getattr(self.model, 'variant', '')

    @property
    def _is_se_resnext(self):
        """Whether the wrapped model declares the 'SEResNeXt' naming rule."""
        return self.model_type == 'SEResNeXt'

    def fix_conv_norm_name(self, name):
        """Return the norm-layer name that belongs to conv layer ``name``."""
        # the naming rule is same as pretrained weight
        if self._is_se_resnext:
            return name + "_bn"
        if name == "conv1":
            return "bn_" + name
        return "bn" + name[3:]

    def fix_shortcut_name(self, name):
        """Return the shortcut layer name derived from ``name``."""
        if self._is_se_resnext:
            return 'conv' + name + '_prj'
        return name

    def fix_bottleneck_name(self, name):
        """Return ``(conv1, conv2, conv3, shortcut)`` names of a bottleneck."""
        if self._is_se_resnext:
            convs = tuple(
                'conv' + name + suffix for suffix in ('_x1', '_x2', '_x3'))
            return convs + (name, )
        convs = tuple(
            name + suffix
            for suffix in ("_branch2a", "_branch2b", "_branch2c"))
        return convs + (name + "_branch1", )

    def fix_basicblock_name(self, name):
        """Return ``(conv1, conv2, shortcut)`` names of a basic block."""
        if self._is_se_resnext:
            return 'conv' + name + '_x1', 'conv' + name + '_x2', name
        return name + "_branch2a", name + "_branch2b", name + "_branch1"

    def fix_layer_warp_name(self, stage_num, count, i):
        """Return the name of block ``i`` in stage ``stage_num``.

        Stage 4 with more than 10 blocks uses ``res4a``, ``res4b1``,
        ``res4b2`` ...; every other case appends a letter (``a``, ``b`` ...).
        SEResNeXt models use ``'<stage_num + 2>_<i + 1>'`` instead.
        """
        if self._is_se_resnext:
            return str(stage_num + 2) + '_' + str(i + 1)
        prefix = 'res' + str(stage_num)
        if count > 10 and stage_num == 4:
            return prefix + "a" if i == 0 else prefix + "b" + str(i)
        return prefix + chr(ord("a") + i)

    def fix_c1_stage_name(self):
        """Return the name of the stem convolution."""
        if self.model_type == 'ResNeXt':
            return "res_conv1"
        return "conv1"
|
||||
611
rtdetr_paddle/ppdet/modeling/backbones/resnet.py
Executable file
611
rtdetr_paddle/ppdet/modeling/backbones/resnet.py
Executable file
@@ -0,0 +1,611 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
from numbers import Integral
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Uniform
|
||||
from paddle import ParamAttr
|
||||
from paddle.nn.initializer import Constant
|
||||
from paddle.vision.ops import DeformConv2D
|
||||
from .name_adapter import NameAdapter
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck']
|
||||
|
||||
# Number of residual blocks in each of the four stages, keyed by depth.
ResNet_cfg = {
    18: [2, 2, 2, 2],
    34: [3, 4, 6, 3],
    50: [3, 4, 6, 3],
    101: [3, 4, 23, 3],
    152: [3, 8, 36, 3],
}
|
||||
|
||||
|
||||
class ConvNormLayer(nn.Layer):
    """Bias-free convolution (plain or deformable v2) + BatchNorm + activation.

    Args:
        ch_in (int): input channels.
        ch_out (int): output channels.
        filter_size (int): kernel size; padding is ``(filter_size - 1) // 2``.
        stride (int): convolution stride.
        groups (int): convolution groups.
        act (str|None): name of a ``paddle.nn.functional`` activation, or a
            falsy value for no activation.
        norm_type (str): ``'bn'`` or ``'sync_bn'``.
        norm_decay (float): L2 decay applied to the norm weight and bias.
        freeze_norm (bool): if true the norm parameters are not trainable and
            the norm uses global statistics.
        lr (float): learning-rate factor of the conv weight (and of the norm
            parameters when they are not frozen).
        dcn_v2 (bool): use ``DeformConv2D`` with predicted offsets and mask.
    """

    def __init__(self,
                 ch_in,
                 ch_out,
                 filter_size,
                 stride,
                 groups=1,
                 act=None,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 lr=1.0,
                 dcn_v2=False):
        super().__init__()
        assert norm_type in ['bn', 'sync_bn']
        self.norm_type = norm_type
        self.act = act
        self.dcn_v2 = dcn_v2

        if self.dcn_v2:
            # Two offset channels and one mask channel per kernel position.
            self.offset_channel = 2 * filter_size**2
            self.mask_channel = filter_size**2

            self.conv_offset = nn.Conv2D(
                in_channels=ch_in,
                out_channels=3 * filter_size**2,
                kernel_size=filter_size,
                stride=stride,
                padding=(filter_size - 1) // 2,
                weight_attr=ParamAttr(initializer=Constant(0.)),
                bias_attr=ParamAttr(initializer=Constant(0.)))
            self.conv = DeformConv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=(filter_size - 1) // 2,
                dilation=1,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)
        else:
            self.conv = nn.Conv2D(
                in_channels=ch_in,
                out_channels=ch_out,
                kernel_size=filter_size,
                stride=stride,
                padding=(filter_size - 1) // 2,
                groups=groups,
                weight_attr=ParamAttr(learning_rate=lr),
                bias_attr=False)

        norm_lr = 0. if freeze_norm else lr
        trainable = not freeze_norm
        param_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)
        bias_attr = ParamAttr(
            learning_rate=norm_lr,
            regularizer=L2Decay(norm_decay),
            trainable=trainable)

        global_stats = True if freeze_norm else None
        if norm_type in ['sync_bn', 'bn']:
            self.norm = nn.BatchNorm2D(
                ch_out,
                weight_attr=param_attr,
                bias_attr=bias_attr,
                use_global_stats=global_stats)
        norm_params = self.norm.parameters()

        if freeze_norm:
            for frozen in norm_params:
                frozen.stop_gradient = True

    def forward(self, inputs):
        """Apply conv, then norm, then the optional activation."""
        if self.dcn_v2:
            offset_mask = self.conv_offset(inputs)
            offset, mask = paddle.split(
                offset_mask,
                num_or_sections=[self.offset_channel, self.mask_channel],
                axis=1)
            mask = F.sigmoid(mask)
            out = self.conv(inputs, offset, mask=mask)
        else:
            out = self.conv(inputs)

        if self.norm_type in ['bn', 'sync_bn']:
            out = self.norm(out)
        if self.act:
            activation = getattr(F, self.act)
            out = activation(out)
        return out
|
||||
|
||||
|
||||
class SELayer(nn.Layer):
    """Channel gating: global pool -> linear -> relu -> linear -> sigmoid.

    Args:
        ch (int): number of channels of the gated feature map.
        reduction_ratio (int): the hidden width is ``ch // reduction_ratio``.
    """

    def __init__(self, ch, reduction_ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)

        # Both linear layers are initialised uniformly in +-1/sqrt(fan_in).
        bound = 1.0 / math.sqrt(ch)
        hidden = ch // reduction_ratio
        self.squeeze = nn.Linear(
            ch,
            hidden,
            weight_attr=paddle.ParamAttr(initializer=Uniform(-bound, bound)),
            bias_attr=True)

        bound = 1.0 / math.sqrt(hidden)
        self.extract = nn.Linear(
            hidden,
            ch,
            weight_attr=paddle.ParamAttr(initializer=Uniform(-bound, bound)),
            bias_attr=True)

    def forward(self, inputs):
        """Return ``inputs`` multiplied by its per-channel gate."""
        pooled = paddle.squeeze(self.pool(inputs), axis=[2, 3])
        hidden = F.relu(self.squeeze(pooled))
        gate = F.sigmoid(self.extract(hidden))
        gate = paddle.unsqueeze(gate, axis=[2, 3])
        return gate * inputs
|
||||
|
||||
|
||||
class BasicBlock(nn.Layer):
    """Residual block made of two 3x3 conv+norm layers (``expansion = 1``).

    Args:
        ch_in (int): input channels.
        ch_out (int): output channels.
        stride (int): stride of the first conv (and of the projection).
        shortcut (bool): if true the input is added unchanged; otherwise a
            1x1 projection (``self.short``) is applied to it first.
        variant (str): with ``'d'`` and ``stride == 2`` the projection is an
            average pool followed by a stride-1 1x1 conv.
        groups (int): must be 1.
        base_width (int): must be 64.
        lr, norm_type, norm_decay, freeze_norm: forwarded to every
            ``ConvNormLayer``.
        dcn_v2 (bool): use a deformable conv for the second 3x3 layer.
        std_senet (bool): append an ``SELayer`` before the residual add.
    """

    expansion = 1

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()
        assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64'

        # Keyword arguments common to every conv+norm layer of the block.
        common = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=ch_out,
                        filter_size=1,
                        stride=1,
                        **common))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=ch_out,
                    filter_size=1,
                    stride=stride,
                    **common)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=ch_out,
            filter_size=3,
            stride=stride,
            act='relu',
            **common)

        self.branch2b = ConvNormLayer(
            ch_in=ch_out,
            ch_out=ch_out,
            filter_size=3,
            stride=1,
            act=None,
            dcn_v2=dcn_v2,
            **common)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(ch_out)

    def forward(self, inputs):
        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
        residual = self.branch2b(self.branch2a(inputs))
        if self.std_senet:
            residual = self.se(residual)

        skip = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=skip))
|
||||
|
||||
|
||||
class BottleNeck(nn.Layer):
    """Residual bottleneck: 1x1 -> 3x3 -> 1x1 conv+norm (``expansion = 4``).

    Args:
        ch_in (int): input channels.
        ch_out (int): base width; the block outputs ``ch_out * 4`` channels.
        stride (int): block stride. Variant ``'a'`` applies it in the first
            1x1 conv, every other variant in the 3x3 conv.
        shortcut (bool): if true the input is added unchanged; otherwise a
            1x1 projection (``self.short``) is applied to it first.
        variant (str): with ``'d'`` and ``stride == 2`` the projection is an
            average pool followed by a stride-1 1x1 conv.
        groups (int): groups of the 3x3 conv.
        base_width (int): the inner width is
            ``int(ch_out * (base_width / 64.)) * groups``.
        lr, norm_type, norm_decay, freeze_norm: forwarded to every
            ``ConvNormLayer``.
        dcn_v2 (bool): use a deformable conv for the 3x3 layer.
        std_senet (bool): append an ``SELayer`` before the residual add.
    """

    expansion = 4

    def __init__(self,
                 ch_in,
                 ch_out,
                 stride,
                 shortcut,
                 variant='b',
                 groups=1,
                 base_width=4,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()
        stride1, stride2 = (stride, 1) if variant == 'a' else (1, stride)

        # ResNeXt
        width = int(ch_out * (base_width / 64.)) * groups
        expanded = ch_out * self.expansion

        # Keyword arguments common to every conv+norm layer of the block.
        common = dict(
            norm_type=norm_type,
            norm_decay=norm_decay,
            freeze_norm=freeze_norm,
            lr=lr)

        self.branch2a = ConvNormLayer(
            ch_in=ch_in,
            ch_out=width,
            filter_size=1,
            stride=stride1,
            groups=1,
            act='relu',
            **common)

        self.branch2b = ConvNormLayer(
            ch_in=width,
            ch_out=width,
            filter_size=3,
            stride=stride2,
            groups=groups,
            act='relu',
            dcn_v2=dcn_v2,
            **common)

        self.branch2c = ConvNormLayer(
            ch_in=width,
            ch_out=expanded,
            filter_size=1,
            stride=1,
            groups=1,
            **common)

        self.shortcut = shortcut
        if not shortcut:
            if variant == 'd' and stride == 2:
                self.short = nn.Sequential()
                self.short.add_sublayer(
                    'pool',
                    nn.AvgPool2D(
                        kernel_size=2, stride=2, padding=0, ceil_mode=True))
                self.short.add_sublayer(
                    'conv',
                    ConvNormLayer(
                        ch_in=ch_in,
                        ch_out=expanded,
                        filter_size=1,
                        stride=1,
                        **common))
            else:
                self.short = ConvNormLayer(
                    ch_in=ch_in,
                    ch_out=expanded,
                    filter_size=1,
                    stride=stride,
                    **common)

        self.std_senet = std_senet
        if self.std_senet:
            self.se = SELayer(expanded)

    def forward(self, inputs):
        """Return ``relu(branch(inputs) + shortcut(inputs))``."""
        residual = inputs
        for layer in (self.branch2a, self.branch2b, self.branch2c):
            residual = layer(residual)

        if self.std_senet:
            residual = self.se(residual)

        skip = inputs if self.shortcut else self.short(inputs)
        return F.relu(paddle.add(x=residual, y=skip))
|
||||
|
||||
|
||||
class Blocks(nn.Layer):
    """One stage: ``count`` residual blocks applied one after another.

    The first block never uses the identity shortcut and has stride 2
    unless ``stage_num == 2``; the following blocks use stride 1, the
    identity shortcut and ``ch_out * block.expansion`` input channels.

    Args:
        block (type): block class, e.g. ``BasicBlock`` or ``BottleNeck``.
        ch_in (int): input channels of the first block.
        ch_out (int): ``ch_out`` passed to every block.
        count (int): number of blocks.
        name_adapter (NameAdapter): supplies the sublayer names.
        stage_num (int): stage index used for naming and the stride rule.
        variant, groups, base_width, lr, norm_type, norm_decay, freeze_norm,
        dcn_v2, std_senet: forwarded unchanged to every block.
    """

    def __init__(self,
                 block,
                 ch_in,
                 ch_out,
                 count,
                 name_adapter,
                 stage_num,
                 variant='b',
                 groups=1,
                 base_width=64,
                 lr=1.0,
                 norm_type='bn',
                 norm_decay=0.,
                 freeze_norm=True,
                 dcn_v2=False,
                 std_senet=False):
        super().__init__()

        self.blocks = []
        for i in range(count):
            is_first = i == 0
            conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i)
            sublayer = block(
                ch_in=ch_in,
                ch_out=ch_out,
                stride=2 if is_first and stage_num != 2 else 1,
                shortcut=not is_first,
                variant=variant,
                groups=groups,
                base_width=base_width,
                lr=lr,
                norm_type=norm_type,
                norm_decay=norm_decay,
                freeze_norm=freeze_norm,
                dcn_v2=dcn_v2,
                std_senet=std_senet)
            self.blocks.append(self.add_sublayer(conv_name, sublayer))
            if is_first:
                # Later blocks consume the expanded output of the first one.
                ch_in = ch_out * block.expansion

    def forward(self, inputs):
        """Feed ``inputs`` through all blocks in order."""
        feat = inputs
        for layer in self.blocks:
            feat = layer(feat)
        return feat
|
||||
|
||||
|
||||
@register
@serializable
class ResNet(nn.Layer):
    __shared__ = ['norm_type']

    def __init__(self,
                 depth=50,
                 ch_in=64,
                 variant='b',
                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
                 groups=1,
                 base_width=64,
                 norm_type='bn',
                 norm_decay=0,
                 freeze_norm=True,
                 freeze_at=0,
                 return_idx=[0, 1, 2, 3],
                 dcn_v2_stages=[-1],
                 num_stages=4,
                 std_senet=False,
                 freeze_stem_only=False):
        """
        Residual Network, see https://arxiv.org/abs/1512.03385

        Args:
            depth (int): ResNet depth, should be 18, 34, 50, 101, 152.
            ch_in (int): output channel of first stage, default 64
            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
            lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5),
                lower learning rate ratio is need for pretrained model
                got using distillation(default as [1.0, 1.0, 1.0, 1.0]).
                Must have exactly 4 entries.
            groups (int): group convolution cardinality
            base_width (int): base width of each group convolution
            norm_type (str): normalization type, 'bn' or 'sync_bn'
            norm_decay (float): weight decay for normalization layer weights
            freeze_norm (bool): freeze normalization layers
            freeze_at (int): when >= 0 the stem is frozen and, unless
                `freeze_stem_only` is set, so are the stages with index
                0..freeze_at (capped at `num_stages`); a negative value
                freezes nothing.
            return_idx (list|int): index of the stages whose feature maps are returned
            dcn_v2_stages (list|int): index of stages who select deformable conv v2;
                an empty list selects no stage.
            num_stages (int): total num of stages, between 1 and 4
            std_senet (bool): whether use senet, default False.
            freeze_stem_only (bool): with `freeze_at >= 0`, freeze only the
                stem and leave every residual stage trainable.
        """
        super(ResNet, self).__init__()
        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
        assert num_stages >= 1 and num_stages <= 4
        self.depth = depth
        self.variant = variant
        self.groups = groups
        self.base_width = base_width
        self.norm_type = norm_type
        self.norm_decay = norm_decay
        self.freeze_norm = freeze_norm
        self.freeze_at = freeze_at
        if isinstance(return_idx, Integral):
            return_idx = [return_idx]
        assert max(return_idx) < num_stages, \
            'the maximum return index must smaller than num_stages, ' \
            'but received maximum return index is {} and num_stages ' \
            'is {}'.format(max(return_idx), num_stages)
        # Store a copy so the shared default list is never aliased.
        self.return_idx = list(return_idx)
        self.num_stages = num_stages
        assert len(lr_mult_list) == 4, \
            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
        if isinstance(dcn_v2_stages, Integral):
            dcn_v2_stages = [dcn_v2_stages]
        # `default` keeps an empty selection (no DCN stage) valid.
        assert max(dcn_v2_stages, default=-1) < num_stages
        self.dcn_v2_stages = list(dcn_v2_stages)

        block_nums = ResNet_cfg[depth]
        na = NameAdapter(self)

        conv1_name = na.fix_c1_stage_name()
        if variant in ['c', 'd']:
            # Variants 'c' and 'd' replace the 7x7 stem by three 3x3 convs.
            conv_def = [
                [3, ch_in // 2, 3, 2, "conv1_1"],
                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
            ]
        else:
            conv_def = [[3, ch_in, 7, 2, conv1_name]]
        self.conv1 = nn.Sequential()
        for (c_in, c_out, k, s, _name) in conv_def:
            self.conv1.add_sublayer(
                _name,
                ConvNormLayer(
                    ch_in=c_in,
                    ch_out=c_out,
                    filter_size=k,
                    stride=s,
                    groups=1,
                    act='relu',
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    lr=1.0))

        self.ch_in = ch_in
        ch_out_list = [64, 128, 256, 512]
        block = BottleNeck if depth >= 50 else BasicBlock

        self._out_channels = [block.expansion * v for v in ch_out_list]
        self._out_strides = [4, 8, 16, 32]

        self.res_layers = []
        for i in range(num_stages):
            lr_mult = lr_mult_list[i]
            stage_num = i + 2
            res_name = "res{}".format(stage_num)
            res_layer = self.add_sublayer(
                res_name,
                Blocks(
                    block,
                    self.ch_in,
                    ch_out_list[i],
                    count=block_nums[i],
                    name_adapter=na,
                    stage_num=stage_num,
                    variant=variant,
                    groups=groups,
                    base_width=base_width,
                    lr=lr_mult,
                    norm_type=norm_type,
                    norm_decay=norm_decay,
                    freeze_norm=freeze_norm,
                    dcn_v2=(i in self.dcn_v2_stages),
                    std_senet=std_senet))
            self.res_layers.append(res_layer)
            # The next stage consumes this stage's output channels.
            self.ch_in = self._out_channels[i]

        if freeze_at >= 0:
            self._freeze_parameters(self.conv1)
            if not freeze_stem_only:
                for i in range(min(freeze_at + 1, num_stages)):
                    self._freeze_parameters(self.res_layers[i])

    def _freeze_parameters(self, m):
        """Set ``stop_gradient`` on every parameter of layer ``m``."""
        for p in m.parameters():
            p.stop_gradient = True

    @property
    def out_shape(self):
        """``ShapeSpec`` (channels and stride) of each stage in ``return_idx``."""
        return [
            ShapeSpec(
                channels=self._out_channels[i], stride=self._out_strides[i])
            for i in self.return_idx
        ]

    def forward(self, inputs):
        """Return the feature maps of the stages listed in ``return_idx``.

        Args:
            inputs (dict): must contain the key ``'image'``.
        """
        x = inputs['image']
        conv1 = self.conv1(x)
        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
        outs = []
        for idx, stage in enumerate(self.res_layers):
            x = stage(x)
            if idx in self.return_idx:
                outs.append(x)
        return outs
|
||||
|
||||
|
||||
@register
class Res5Head(nn.Layer):
    """Head made of a single stage-5 ``Blocks`` group with three blocks.

    Args:
        depth (int): ``depth >= 50`` selects ``BottleNeck`` blocks (1024
            input and 2048 output channels); smaller depths select
            ``BasicBlock`` (256 input and 512 output channels).
    """

    def __init__(self, depth=50):
        super().__init__()
        feat_in, feat_out = [1024, 512]
        is_shallow = depth < 50
        if is_shallow:
            feat_in = 256
        na = NameAdapter(self)
        block = BottleNeck if depth >= 50 else BasicBlock
        self.res5 = Blocks(
            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
        self.feat_out = feat_out if is_shallow else feat_out * 4

    @property
    def out_shape(self):
        """Single ``ShapeSpec`` with ``self.feat_out`` channels, stride 16."""
        spec = ShapeSpec(
            channels=self.feat_out,
            stride=16, )
        return [spec]

    def forward(self, roi_feat, stage=0):
        """Apply the stage-5 blocks to ``roi_feat``; ``stage`` is unused."""
        return self.res5(roi_feat)
|
||||
250
rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
Normal file
250
rtdetr_paddle/ppdet/modeling/backbones/shufflenet_v2.py
Normal file
@@ -0,0 +1,250 @@
|
||||
# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
from paddle import ParamAttr
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D
|
||||
from paddle.nn.initializer import KaimingNormal
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from numbers import Integral
|
||||
from ..shape_spec import ShapeSpec
|
||||
from ppdet.modeling.ops import channel_shuffle
|
||||
|
||||
__all__ = ['ShuffleNetV2']
|
||||
|
||||
|
||||
class ConvBNLayer(nn.Layer):
    """Bias-free ``Conv2D`` + ``BatchNorm2D`` + optional activation.

    Args:
        in_channels, out_channels, kernel_size, stride, padding, groups:
            forwarded to ``Conv2D``.
        act (str|None): name of a ``paddle.nn.functional`` activation;
            ``"hard_swish"`` is mapped to ``'hardswish'``. A falsy value
            disables the activation.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride,
                 padding,
                 groups=1,
                 act=None):
        super().__init__()
        conv_weight = ParamAttr(initializer=KaimingNormal())
        self._conv = Conv2D(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            weight_attr=conv_weight,
            bias_attr=False)

        # The norm scale and shift are excluded from weight decay.
        self._batch_norm = BatchNorm2D(
            out_channels,
            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
        self.act = 'hardswish' if act == "hard_swish" else act

    def forward(self, inputs):
        """Apply conv, batch norm and the optional activation."""
        out = self._batch_norm(self._conv(inputs))
        if not self.act:
            return out
        return getattr(F, self.act)(out)
|
||||
|
||||
|
||||
class InvertedResidual(nn.Layer):
    """ShuffleNetV2 unit that transforms only one half of the channels.

    The input is split in two along the channel axis; the second half goes
    through 1x1 conv -> 3x3 depthwise conv -> 1x1 conv, is concatenated
    after the untouched first half and the result is channel-shuffled with
    2 groups.

    Args:
        in_channels (int): input channels (half of them enter the branch).
        out_channels (int): output channels (the branch yields half).
        stride (int): stride of the depthwise conv.
        act (str): activation of the two 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super().__init__()
        branch_ch = out_channels // 2
        self._conv_pw = ConvBNLayer(
            in_channels=in_channels // 2,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        self._conv_dw = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=branch_ch,
            act=None)
        self._conv_linear = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        """Split, transform the second half, concatenate and shuffle."""
        half = inputs.shape[1] // 2
        kept, branch = paddle.split(
            inputs, num_or_sections=[half, half], axis=1)
        for layer in (self._conv_pw, self._conv_dw, self._conv_linear):
            branch = layer(branch)
        merged = paddle.concat([kept, branch], axis=1)
        return channel_shuffle(merged, 2)
|
||||
|
||||
|
||||
class InvertedResidualDS(nn.Layer):
    """ShuffleNetV2 unit in which both branches process the whole input.

    Branch 1 is 3x3 depthwise conv -> 1x1 conv; branch 2 is 1x1 conv ->
    3x3 depthwise conv -> 1x1 conv. Each branch yields
    ``out_channels // 2`` channels; the outputs are concatenated and
    channel-shuffled with 2 groups.

    Args:
        in_channels (int): input channels.
        out_channels (int): output channels.
        stride (int): stride of both depthwise convs.
        act (str): activation of the 1x1 convs.
    """

    def __init__(self, in_channels, out_channels, stride, act="relu"):
        super().__init__()
        branch_ch = out_channels // 2

        # branch1
        self._conv_dw_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=in_channels,
            act=None)
        self._conv_linear_1 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        # branch2
        self._conv_pw_2 = ConvBNLayer(
            in_channels=in_channels,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)
        self._conv_dw_2 = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=3,
            stride=stride,
            padding=1,
            groups=branch_ch,
            act=None)
        self._conv_linear_2 = ConvBNLayer(
            in_channels=branch_ch,
            out_channels=branch_ch,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=1,
            act=act)

    def forward(self, inputs):
        """Run both branches on ``inputs``, concatenate and shuffle."""
        first = self._conv_linear_1(self._conv_dw_1(inputs))
        second = self._conv_pw_2(inputs)
        second = self._conv_dw_2(second)
        second = self._conv_linear_2(second)
        merged = paddle.concat([first, second], axis=1)
        return channel_shuffle(merged, 2)
|
||||
|
||||
|
||||
@register
@serializable
class ShuffleNetV2(nn.Layer):
    """ShuffleNetV2 backbone.

    Args:
        scale (float): width multiplier selecting the per-stage channel
            counts; must equal one of 0.25, 0.33, 0.5, 1.0, 1.5 or 2.0.
        act (str): activation name forwarded to every ConvBNLayer.
        feature_maps (int | list[int]): layer indices whose outputs are
            returned by ``forward``. The stem counts as index 1 and the
            first bottleneck block as index 2.

    Raises:
        NotImplementedError: if ``scale`` is not one of the supported values.
    """

    def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]):
        super(ShuffleNetV2, self).__init__()
        self.scale = scale
        self.feature_maps = ([feature_maps]
                             if isinstance(feature_maps, Integral) else
                             feature_maps)
        stage_repeats = [4, 8, 4]

        # (scale, channels) pairs, scanned in order with ``==``.
        channel_table = (
            (0.25, [-1, 24, 24, 48, 96, 512]),
            (0.33, [-1, 24, 32, 64, 128, 512]),
            (0.5, [-1, 24, 48, 96, 192, 1024]),
            (1.0, [-1, 24, 116, 232, 464, 1024]),
            (1.5, [-1, 24, 176, 352, 704, 1024]),
            (2.0, [-1, 24, 244, 488, 976, 2048]),
        )
        for known_scale, channels in channel_table:
            if scale == known_scale:
                stage_out_channels = channels
                break
        else:
            raise NotImplementedError("This scale size:[" + str(scale) +
                                      "] is not implemented!")

        self._out_channels = []
        self._feature_idx = 0

        # 1. stem: 3x3 stride-2 conv followed by 3x3 stride-2 max pooling
        self._conv1 = ConvBNLayer(
            in_channels=3,
            out_channels=stage_out_channels[1],
            kernel_size=3,
            stride=2,
            padding=1,
            act=act)
        self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1)
        self._feature_idx += 1

        # 2. bottleneck stages: the first block of each stage uses
        #    InvertedResidualDS with stride 2, the rest InvertedResidual.
        self._block_list = []
        for stage_id, num_repeat in enumerate(stage_repeats):
            stage_channels = stage_out_channels[stage_id + 2]
            for i in range(num_repeat):
                if i == 0:
                    unit = InvertedResidualDS(
                        in_channels=stage_out_channels[stage_id + 1],
                        out_channels=stage_channels,
                        stride=2,
                        act=act)
                else:
                    unit = InvertedResidual(
                        in_channels=stage_channels,
                        out_channels=stage_channels,
                        stride=1,
                        act=act)
                block = self.add_sublayer(
                    name=str(stage_id + 2) + '_' + str(i + 1), sublayer=unit)
                self._block_list.append(block)
                self._feature_idx += 1
                self._update_out_channels(stage_channels, self._feature_idx,
                                          self.feature_maps)

    def _update_out_channels(self, channel, feature_idx, feature_maps):
        """Record ``channel`` when ``feature_idx`` is a requested output."""
        if feature_idx not in feature_maps:
            return
        self._out_channels.append(channel)

    def forward(self, inputs):
        """Return the feature maps selected by ``self.feature_maps``.

        Args:
            inputs (dict): must provide the image tensor under ``'image'``.
        """
        y = self._max_pool(self._conv1(inputs['image']))
        outs = []
        # Block numbering starts at 2, matching the indices used above.
        for layer_idx, block in enumerate(self._block_list, start=2):
            y = block(y)
            if layer_idx in self.feature_maps:
                outs.append(y)
        return outs

    @property
    def out_shape(self):
        """One ShapeSpec per recorded output channel count."""
        return [ShapeSpec(channels=c) for c in self._out_channels]
|
||||
752
rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
Normal file
752
rtdetr_paddle/ppdet/modeling/backbones/swin_transformer.py
Normal file
@@ -0,0 +1,752 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py
|
||||
The copyright of microsoft/Swin-Transformer is as follows:
|
||||
MIT License [see LICENSE for details]
|
||||
"""
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from .transformer_utils import DropPath, Identity
|
||||
from .transformer_utils import add_parameter, to_2tuple
|
||||
from .transformer_utils import ones_, zeros_, trunc_normal_
|
||||
|
||||
__all__ = ['SwinTransformer']
|
||||
|
||||
# Per-architecture hyper-parameters, keyed by the ``arch`` name accepted by
# SwinTransformer. The 22k-to-1k fine-tuned weights are the default
# ``pretrained`` source; it can be overridden through
# SwinTransformer.pretrained in the config.
MODEL_cfg = {
    'swin_T_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 6, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_S_224': {
        'pretrain_img_size': 224,
        'embed_dim': 96,
        'depths': [2, 2, 18, 2],
        'num_heads': [3, 6, 12, 24],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_224': {
        'pretrain_img_size': 224,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_L_224': {
        'pretrain_img_size': 224,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 7,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams',
    },
    'swin_B_384': {
        'pretrain_img_size': 384,
        'embed_dim': 128,
        'depths': [2, 2, 18, 2],
        'num_heads': [4, 8, 16, 32],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
    'swin_L_384': {
        'pretrain_img_size': 384,
        'embed_dim': 192,
        'depths': [2, 2, 18, 2],
        'num_heads': [6, 12, 24, 48],
        'window_size': 12,
        'pretrained': 'https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams',
    },
}
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Feed-forward block: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): size of the input features.
        hidden_features (int | None): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int | None): output width; falls back to
            ``in_features`` when falsy.
        act_layer: callable returning the activation layer.
        drop (float): dropout probability shared by both dropout calls.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the two linear layers with activation and dropout."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
||||
|
||||
|
||||
def window_partition(x, window_size):
    """
    Cut a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    _, height, width, channels = x.shape
    # Split H and W into (number of windows, offset inside the window).
    grid = x.reshape([
        -1, height // window_size, window_size, width // window_size,
        window_size, channels
    ])
    # Bring the two window-count axes next to each other, then flatten them
    # together with the batch axis.
    grouped = grid.transpose([0, 1, 3, 2, 4, 5])
    return grouped.reshape([-1, window_size, window_size, channels])
|
||||
|
||||
|
||||
def window_reverse(windows, window_size, H, W):
    """
    Reassemble windows into a feature map (the inverse reshape/transpose of
    window_partition).

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    _, _, _, C = windows.shape
    # The batch axis is left as -1 so reshape infers it; the previous
    # explicit batch computation used float division and was never read.
    x = windows.reshape(
        [-1, H // window_size, W // window_size, window_size, window_size, C])
    # Interleave window rows/cols with in-window rows/cols, then flatten
    # each pair back into the full H and W axes.
    x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C])
    return x
|
||||
|
||||
|
||||
class WindowAttention(nn.Layer):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self,
                 dim,
                 window_size,
                 num_heads,
                 qkv_bias=True,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # A falsy qk_scale (None or 0) falls back to head_dim ** -0.5.
        self.scale = qk_scale or head_dim**-0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = add_parameter(
            self,
            paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
                          num_heads)))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(self.window_size[0])
        coords_w = paddle.arange(self.window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = paddle.flatten(coords, 1)  # 2, Wh*Ww
        coords_flatten_1 = coords_flatten.unsqueeze(axis=2)
        coords_flatten_2 = coords_flatten.unsqueeze(axis=1)
        # Broadcasted difference: every token's coordinate minus every other's.
        relative_coords = coords_flatten_1 - coords_flatten_2
        relative_coords = relative_coords.transpose(
            [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[
            0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # Scale the row offset by the number of possible column offsets so
        # that the sum below is a unique flat index into the bias table.
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        # Stored as a plain attribute (not created through add_parameter).
        self.relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww

        self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table)
        self.softmax = nn.Softmax(axis=-1)

    def forward(self, x, mask=None):
        """ Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: additive attention mask with shape of
                (num_windows, Wh*Ww, Wh*Ww) or None. It is added to the
                attention logits before softmax.
        Returns:
            Tensor reshaped to (-1, N, C) after the output projection and
            its dropout.
        """
        B_, N, C = x.shape
        # One projection yields q, k and v; after the transpose the leading
        # axis of size 3 selects between them.
        qkv = self.qkv(x).reshape(
            [-1, N, 3, self.num_heads, C // self.num_heads]).transpose(
                [2, 0, 3, 1, 4])
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = paddle.mm(q, k.transpose([0, 1, 3, 2]))

        index = self.relative_position_index.flatten()

        # Gather one bias row per (query token, key token) pair.
        relative_position_bias = paddle.index_select(
            self.relative_position_bias_table, index)
        relative_position_bias = relative_position_bias.reshape([
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1], -1
        ])  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.transpose(
            [2, 0, 1])  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            nW = mask.shape[0]
            # Expose the window axis so the per-window mask broadcasts over
            # batch and heads, then fold it back.
            attn = attn.reshape([-1, nW, self.num_heads, N, N
                                 ]) + mask.unsqueeze(1).unsqueeze(0)
            attn = attn.reshape([-1, self.num_heads, N, N])
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        # x = (attn @ v).transpose(1, 2).reshape([B_, N, C])
        x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
|
||||
|
||||
|
||||
class SwinTransformerBlock(nn.Layer):
    """ Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self,
                 dim,
                 num_heads,
                 window_size=7,
                 shift_size=0,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop)

        # Stochastic depth is only instantiated for a positive rate.
        self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=mlp_hidden_dim,
                       act_layer=act_layer,
                       drop=drop)

        # Spatial resolution of the input; must be assigned by the caller
        # before forward() is invoked.
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C). H and W are read
                from ``self.H`` / ``self.W`` rather than passed in.
            mask_matrix: Attention mask for cyclic shift; only used when
                ``self.shift_size > 0``.
        Returns:
            Tensor reshaped to (-1, H*W, C).
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        shortcut = x
        x = self.norm1(x)
        x = x.reshape([-1, H, W, C])

        # pad feature maps to multiples of window size
        pad_l = pad_t = 0
        pad_r = (self.window_size - W % self.window_size) % self.window_size
        pad_b = (self.window_size - H % self.window_size) % self.window_size
        # NOTE(review): presumably the 8-element list is read as
        # (before, after) pairs for N, H, W, C in that order, i.e. H is
        # padded by pad_b and W by pad_r at the end - confirm against the
        # paddle F.pad documentation. pad_l and pad_t are always 0 here.
        x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t],
                  data_format='NHWC')
        _, Hp, Wp, _ = x.shape

        # cyclic shift
        if self.shift_size > 0:
            shifted_x = paddle.roll(
                x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2))
            attn_mask = mask_matrix
        else:
            shifted_x = x
            attn_mask = None

        # partition windows
        x_windows = window_partition(
            shifted_x, self.window_size)  # nW*B, window_size, window_size, C
        x_windows = x_windows.reshape(
            [x_windows.shape[0], self.window_size * self.window_size,
             C])  # nW*B, window_size*window_size, C

        # W-MSA/SW-MSA
        attn_windows = self.attn(
            x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C

        # merge windows
        attn_windows = attn_windows.reshape(
            [x_windows.shape[0], self.window_size, self.window_size, C])
        shifted_x = window_reverse(attn_windows, self.window_size, Hp,
                                   Wp)  # B H' W' C

        # reverse cyclic shift
        if self.shift_size > 0:
            x = paddle.roll(
                shifted_x,
                shifts=(self.shift_size, self.shift_size),
                axis=(1, 2))
        else:
            x = shifted_x

        # Crop away the padding added above.
        if pad_r > 0 or pad_b > 0:
            x = x[:, :H, :W, :]

        x = x.reshape([-1, H * W, C])

        # FFN
        x = shortcut + self.drop_path(x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x
|
||||
|
||||
|
||||
class PatchMerging(nn.Layer):
    r""" Patch Merging Layer.

    Concatenates each 2x2 neighbourhood of tokens along the channel axis
    (4*dim), normalises it and projects it linearly to 2*dim.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            Tensor reshaped to (-1, H*W/4, 4*C) and then passed through the
            norm and the linear reduction (H, W after odd-size padding).
        """
        _, num_tokens, C = x.shape
        assert num_tokens == H * W, "input feature has wrong size"

        x = x.reshape([-1, H, W, C])

        # Pad odd spatial sizes by one at the end so 2x2 grouping is exact.
        if (H % 2 == 1) or (W % 2 == 1):
            # paddle F.pad default data_format is 'NCHW'
            x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC')
            H += H % 2
            W += W % 2

        # Quadrant order: (row 0, col 0), (row 1, col 0), (row 0, col 1),
        # (row 1, col 1); each is B H/2 W/2 C.
        quadrants = [
            x[:, row::2, col::2, :] for col in (0, 1) for row in (0, 1)
        ]
        merged = paddle.concat(quadrants, -1)  # B H/2 W/2 4*C
        merged = merged.reshape([-1, H * W // 4, 4 * C])  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
|
||||
|
||||
|
||||
class BasicLayer(nn.Layer):
    """ A basic Swin Transformer layer for one stage.
    Args:
        dim (int): Number of input channels.
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | np.ndarray, optional): Stochastic depth rate. Only
            an ``np.ndarray`` is indexed per block; any other value is passed
            unchanged to every block. Default: 0.0
        norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth

        # build blocks
        # Even-indexed blocks use no shift, odd-indexed blocks shift by
        # window_size // 2.
        self.blocks = nn.LayerList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                drop_path=drop_path[i]
                if isinstance(drop_path, np.ndarray) else drop_path,
                norm_layer=norm_layer) for i in range(depth)
        ])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.
        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
        Returns:
            A 6-tuple ``(x, H, W, x_down, Wh, Ww)``: the stage output with
            its resolution, followed by the downsampled tensor with its
            resolution. Without a downsample layer the last three entries
            repeat the first three.
        """

        # calculate attention mask for SW-MSA
        # Hp / Wp: H and W rounded up to a multiple of the window size.
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32')  # 1 Hp Wp 1
        # Three bands per axis; each of the 3x3 combinations gets its own id.
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt

                cnt += 1

        mask_windows = window_partition(
            img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.reshape(
            [-1, self.window_size * self.window_size])
        # Non-zero where two positions of a window carry different region ids.
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        huns = -100.0 * paddle.ones_like(attn_mask)
        # Final mask: -100.0 for pairs from different regions, 0 otherwise.
        attn_mask = huns * (attn_mask != 0).astype("float32")

        for blk in self.blocks:
            # Blocks read their spatial size from attributes, not arguments.
            blk.H, blk.W = H, W
            x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            # Ceil-divide by two for the downsampled resolution.
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """ Image to Patch Embedding

    Projects the image with a convolution whose kernel size and stride both
    equal the patch size, optionally followed by a normalization layer.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Layer, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_chans = in_chans
        self.embed_dim = embed_dim

        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size)
        self.norm = norm_layer(embed_dim) if norm_layer is not None else None

    def forward(self, x):
        """Pad the input to a multiple of the patch size and embed it."""
        # TODO # export dynamic shape
        _, _, H, W = x.shape
        patch_h = self.patch_size[0]
        patch_w = self.patch_size[1]

        # Pad width first, then height, each only at the trailing edge.
        if W % patch_w != 0:
            x = F.pad(x, [0, patch_w - W % patch_w, 0, 0])
        if H % patch_h != 0:
            x = F.pad(x, [0, 0, 0, patch_h - H % patch_h])

        x = self.proj(x)
        if self.norm is None:
            return x

        # Normalise over channels in token layout, then restore the 4-D map.
        _, _, Wh, Ww = x.shape
        tokens = x.flatten(2).transpose([0, 2, 1])
        tokens = self.norm(tokens)
        return tokens.transpose([0, 2, 1]).reshape(
            [-1, self.embed_dim, Wh, Ww])
|
||||
|
||||
|
||||
@register
@serializable
class SwinTransformer(nn.Layer):
    """ Swin Transformer backbone
    Args:
        arch (str): Swin Transformer architecture name; must be a key of
            MODEL_cfg. The values of pretrain_img_size, embed_dim, depths,
            num_heads and window_size are always taken from MODEL_cfg[arch],
            so the corresponding arguments below are overridden.
        pretrain_img_size (int | tuple(int)): Input image size. Default 224
        patch_size (int | tuple(int)): Patch size. Default: 4
        in_chans (int): Number of input image channels. Default: 3
        embed_dim (int): Patch embedding dimension. Default: 96
        depths (tuple(int)): Depth of each Swin Transformer layer.
        num_heads (tuple(int)): Number of attention heads in different layers.
        window_size (int): Window size. Default: 7
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None
        drop_rate (float): Dropout rate. Default: 0
        attn_drop_rate (float): Attention dropout rate. Default: 0
        drop_path_rate (float): Stochastic depth rate. Default: 0.2
        norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False
        patch_norm (bool): If True, add normalization after patch embedding. Default: True
        out_indices (tuple(int)): Indices of the stages whose normalized
            outputs are returned by forward. Default: (0, 1, 2, 3)
        frozen_stages (int): Controls which parts are put in eval mode with
            gradients stopped (see _freeze_stages). Default: -1
        pretrained (str | None): URL (any string containing 'http') or local
            path of weights to load. None falls back to
            MODEL_cfg[arch]['pretrained']. Default: None
    """

    def __init__(self,
                 arch='swin_T_224',
                 pretrain_img_size=224,
                 patch_size=4,
                 in_chans=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 pretrained=None):
        super(SwinTransformer, self).__init__()
        assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch)

        # The architecture table wins over the explicit arguments.
        pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size']
        embed_dim = MODEL_cfg[arch]['embed_dim']
        depths = MODEL_cfg[arch]['depths']
        num_heads = MODEL_cfg[arch]['num_heads']
        window_size = MODEL_cfg[arch]['window_size']
        if pretrained is None:
            pretrained = MODEL_cfg[arch]['pretrained']

        self.num_layers = len(depths)
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [
                pretrain_img_size[0] // patch_size[0],
                pretrain_img_size[1] // patch_size[1]
            ]

            self.absolute_pos_embed = add_parameter(
                self,
                paddle.zeros((1, embed_dim, patches_resolution[0],
                              patches_resolution[1])))
            trunc_normal_(self.absolute_pos_embed)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = np.linspace(0, drop_path_rate,
                          sum(depths))  # stochastic depth decay rule

        # build layers
        self.layers = nn.LayerList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2**i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                # Slice of the decay schedule belonging to this stage.
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                # Every stage except the last ends with patch merging.
                downsample=PatchMerging
                if (i_layer < self.num_layers - 1) else None)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_sublayer(layer_name, layer)

        self.apply(self._init_weights)
        self._freeze_stages()
        # Pretrained weights are loaded last, after init and freezing.
        if pretrained:
            if 'http' in pretrained:  #URL
                path = paddle.utils.download.get_weights_path_from_url(
                    pretrained)
            else:  #model in local path
                path = pretrained
            self.set_state_dict(paddle.load(path))

    def _freeze_stages(self):
        """Put parts of the network in eval mode and stop their gradients.

        frozen_stages >= 0 freezes the patch embedding; >= 1 (with ape) stops
        the gradient of the absolute position embedding; >= 2 puts pos_drop
        in eval mode and freezes the first ``frozen_stages - 1`` layers.
        """
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.stop_gradient = True

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.stop_gradient = True

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.stop_gradient = True

    def _init_weights(self, m):
        """Initialise nn.Linear and nn.LayerNorm sublayers.

        Linear weights get trunc_normal_ and a bias (if present) gets zeros;
        LayerNorm gets zero bias and unit weight.
        """
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight)
            if isinstance(m, nn.Linear) and m.bias is not None:
                zeros_(m.bias)
        elif isinstance(m, nn.LayerNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def forward(self, x):
        """Forward function.

        Args:
            x (dict): must provide the image tensor under ``'image'``.
        Returns:
            list: one tensor per stage index in ``out_indices``, each
            reshaped to (-1, H, W, num_features[i]) and transposed with
            (0, 3, 1, 2).
        """
        x = self.patch_embed(x['image'])
        B, _, Wh, Ww = x.shape
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(
                self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1])
        else:
            x = x.flatten(2).transpose([0, 2, 1])
        x = self.pos_drop(x)
        outs = []
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out/H/W: this stage's output; x/Wh/Ww: input of the next one.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)
            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)
                out = x_out.reshape((-1, H, W, self.num_features[i])).transpose(
                    (0, 3, 1, 2))
                outs.append(out)

        return outs

    @property
    def out_shape(self):
        """ShapeSpec (channels and stride) for each index in out_indices."""
        out_strides = [4, 8, 16, 32]
        return [
            ShapeSpec(
                channels=self.num_features[i], stride=out_strides[i])
            for i in self.out_indices
        ]
|
||||
381
rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
Normal file
381
rtdetr_paddle/ppdet/modeling/backbones/trans_encoder.py
Normal file
@@ -0,0 +1,381 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle.nn import ReLU, Swish, GELU
|
||||
import math
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..shape_spec import ShapeSpec
|
||||
|
||||
__all__ = ['TransEncoder']
|
||||
|
||||
|
||||
class BertEmbeddings(nn.Layer):
    """Sum of word, position and token-type embeddings, then LayerNorm and dropout.

    Args:
        word_size (int): number of rows of the word embedding table.
        position_embeddings_size (int): number of rows of the position table.
        word_type_size (int): number of rows of the token-type table.
        hidden_size (int): embedding width.
        dropout_prob (float): dropout probability applied to the output.
    """

    def __init__(self, word_size, position_embeddings_size, word_type_size,
                 hidden_size, dropout_prob):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = nn.Embedding(
            word_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x, token_type_ids=None, position_ids=None):
        """Embed the ids in ``x``.

        Args:
            x (Tensor): word ids; its second axis is used as the sequence
                length (assumed integer ids of shape (batch, seq_len) --
                TODO confirm against callers).
            token_type_ids (Tensor, optional): token-type ids; defaults to
                all zeros with the shape of ``x``.
            position_ids (Tensor, optional): position ids; defaults to
                ``0..seq_len-1`` expanded to the shape of ``x``.

        Returns:
            Tensor: the normalized, dropped-out sum of the three embeddings.
        """
        seq_len = paddle.shape(x)[1]
        if position_ids is None:
            position_ids = paddle.arange(
                seq_len, dtype="int64").unsqueeze(0).expand_as(x)
        if token_type_ids is None:
            # paddle.zeros defaults to float32, which is not a valid index
            # dtype for an embedding lookup, so request int64 explicitly.
            token_type_ids = paddle.zeros(paddle.shape(x), dtype="int64")

        word_embs = self.word_embeddings(x)
        position_embs = self.position_embeddings(position_ids)
        token_type_embs = self.token_type_embeddings(token_type_ids)

        embs_cmb = word_embs + position_embs + token_type_embs
        embs_out = self.layernorm(embs_cmb)
        embs_out = self.dropout(embs_out)
        return embs_out
|
||||
|
||||
|
||||
class BertSelfAttention(nn.Layer):
    """Multi-head scaled dot-product self-attention.

    Args:
        hidden_size (int): width of the input features; must be divisible by
            ``num_attention_heads``.
        num_attention_heads (int): number of attention heads.
        attention_probs_dropout_prob (float): dropout applied to the
            attention probabilities.
        output_attentions (bool): when True, ``forward`` also returns the
            attention probabilities.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of
            ``num_attention_heads``.
    """

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 output_attentions=False):
        super(BertSelfAttention, self).__init__()
        if hidden_size % num_attention_heads != 0:
            # The message uses "{}" placeholders, so it has to be rendered
            # with str.format; the "%" operator would raise TypeError here
            # and hide the intended ValueError.
            raise ValueError(
                "The hidden_size must be a multiple of the number of attention "
                "heads, but got {} % {} != 0".format(hidden_size,
                                                     num_attention_heads))

        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(hidden_size, self.all_head_size)
        self.key = nn.Linear(hidden_size, self.all_head_size)
        self.value = nn.Linear(hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(attention_probs_dropout_prob)
        self.output_attentions = output_attentions

    def forward(self, x, attention_mask, head_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x (Tensor): input features; the leading two axes are kept and the
                last axis is split across heads.
            attention_mask (Tensor): additive mask, added to the scaled
                attention scores before the softmax.
            head_mask (Tensor, optional): multiplied into the attention
                probabilities after dropout.

        Returns:
            tuple: ``(context,)`` or ``(context, attention_probs)`` when
            ``output_attentions`` is enabled.
        """
        query = self.query(x)
        key = self.key(x)
        value = self.value(x)

        # Split the last axis into (heads, head_size).
        query_dim1, query_dim2 = paddle.shape(query)[:-1]
        new_shape = [
            query_dim1, query_dim2, self.num_attention_heads,
            self.attention_head_size
        ]
        query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3))
        # key is laid out pre-transposed so that query @ key gives the scores.
        key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1))
        value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3))

        attention = paddle.matmul(query,
                                  key) / math.sqrt(self.attention_head_size)
        attention = attention + attention_mask
        attention_value = F.softmax(attention, axis=-1)
        attention_value = self.dropout(attention_value)

        if head_mask is not None:
            attention_value = attention_value * head_mask

        # Merge the heads back into a single feature axis.
        context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1,
                                                                        3))
        ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2]
        new_context_shape = [
            ctx_dim1,
            ctx_dim2,
            self.all_head_size,
        ]
        context = context.reshape(new_context_shape)

        if self.output_attentions:
            return (context, attention_value)
        else:
            return (context, )
|
||||
|
||||
|
||||
class BertAttention(nn.Layer):
    """Self-attention followed by a linear projection, dropout and a residual LayerNorm."""

    def __init__(self,
                 hidden_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 output_attentions=False):
        super(BertAttention, self).__init__()
        self.bert_selfattention = BertSelfAttention(
            hidden_size, num_attention_heads, attention_probs_dropout_prob,
            output_attentions)
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``.

        The second element is present only when the wrapped self-attention
        returned a pair.
        """
        attn_out = self.bert_selfattention(x, attention_mask, head_mask)
        projected = self.dropout(self.fc(attn_out[0]))
        # Residual connection around the attention sub-block.
        normed = self.layernorm(projected + x)
        if len(attn_out) == 2:
            return (normed, attn_out[1])
        return (normed, )
|
||||
|
||||
|
||||
class BertFeedForward(nn.Layer):
    """Position-wise feed-forward block with a residual LayerNorm.

    Args:
        hidden_size (int): input and output width.
        intermediate_size (int): width of the hidden projection.
        num_attention_heads: accepted for signature compatibility; unused here.
        attention_probs_dropout_prob: accepted for signature compatibility;
            unused here.
        fc_dropout_prob (float): dropout applied after the second projection.
        act_fn (str): name of the activation, resolved in this module's
            namespace (e.g. ``'ReLU'``, ``'Swish'``, ``'GELU'``, ``'gelu'``).
        output_attentions: accepted for signature compatibility; unused here.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertFeedForward, self).__init__()
        self.fc1 = nn.Linear(hidden_size, intermediate_size)
        # NOTE(review): eval() executes the configured string as code; it
        # must only ever receive trusted configuration values.
        activation = eval(act_fn)
        if isinstance(activation, type):
            # Names such as 'ReLU' resolve to a layer *class*. Calling the
            # class in forward() would build a new layer instead of applying
            # the activation, so instantiate it once here.
            activation = activation()
        self.act_fn = activation
        self.fc2 = nn.Linear(intermediate_size, hidden_size)
        self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)
        self.dropout = nn.Dropout(fc_dropout_prob)

    def forward(self, x):
        """Return ``LayerNorm(dropout(fc2(act(fc1(x)))) + x)``."""
        features = self.fc1(x)
        features = self.act_fn(features)
        features = self.fc2(features)
        features = self.dropout(features)
        features = self.layernorm(features + x)
        return features
|
||||
|
||||
|
||||
class BertLayer(nn.Layer):
    """One transformer encoder layer: attention block followed by feed-forward block.

    Args:
        hidden_size (int): feature width.
        intermediate_size (int): hidden width of the feed-forward block.
        num_attention_heads (int): number of attention heads.
        attention_probs_dropout_prob (float): dropout on attention
            probabilities.
        fc_dropout_prob (float): dropout after the linear projections.
        act_fn (str): activation name forwarded to ``BertFeedForward``.
        output_attentions (bool): when True, ``forward`` also returns the
            attention probabilities.
    """

    def __init__(self,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False):
        super(BertLayer, self).__init__()
        # Pass the trailing arguments by keyword: BertAttention expects
        # fc_dropout_prob before output_attentions, so handing
        # output_attentions over positionally would land it in the dropout
        # slot and leave output_attentions at its default.
        self.attention = BertAttention(
            hidden_size,
            num_attention_heads,
            attention_probs_dropout_prob,
            fc_dropout_prob=fc_dropout_prob,
            output_attentions=output_attentions)
        self.feed_forward = BertFeedForward(
            hidden_size, intermediate_size, num_attention_heads,
            attention_probs_dropout_prob, fc_dropout_prob, act_fn,
            output_attentions)

    def forward(self, x, attention_mask, head_mask=None):
        """Return ``(features,)`` or ``(features, attention_probs)``."""
        attention_feats = self.attention(x, attention_mask, head_mask)
        features = self.feed_forward(attention_feats[0])
        if len(attention_feats) == 2:
            return (features, attention_feats[1])
        else:
            return (features, )
|
||||
|
||||
|
||||
class BertEncoder(nn.Layer):
    """Stack of ``num_hidden_layers`` identical ``BertLayer`` modules.

    Args:
        num_hidden_layers (int): number of stacked layers.
        hidden_size, intermediate_size, num_attention_heads,
        attention_probs_dropout_prob, fc_dropout_prob, act_fn:
            forwarded unchanged to every ``BertLayer``.
        output_attentions (bool): also return every layer's attention
            probabilities.
        output_hidden_feats (bool): also return the input and every layer's
            output features.
    """

    def __init__(self,
                 num_hidden_layers,
                 hidden_size,
                 intermediate_size,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(BertEncoder, self).__init__()
        self.output_attentions = output_attentions
        self.output_hidden_feats = output_hidden_feats
        self.layers = nn.LayerList([
            BertLayer(hidden_size, intermediate_size, num_attention_heads,
                      attention_probs_dropout_prob, fc_dropout_prob, act_fn,
                      output_attentions) for _ in range(num_hidden_layers)
        ])

    def forward(self, x, attention_mask, head_mask=None):
        """Run all layers in sequence.

        Args:
            x (Tensor): input features.
            attention_mask (Tensor): additive mask passed to every layer.
            head_mask (sequence, optional): per-layer head masks, indexed by
                layer position.

        Returns:
            tuple: ``(last_features,)`` followed by the tuple of hidden
            features (input first, then each layer's output) when
            ``output_hidden_feats`` is set, and by the tuple of attention
            probabilities when ``output_attentions`` is set.
        """
        all_features = (x, )
        all_attentions = ()

        for i, layer in enumerate(self.layers):
            mask = head_mask[i] if head_mask is not None else None
            layer_out = layer(x, attention_mask, mask)
            x = layer_out[0]

            # Record the layer's *output*. Recording x before the update
            # would store the input twice and never store the final layer.
            if self.output_hidden_feats:
                all_features = all_features + (x, )
            if self.output_attentions:
                all_attentions = all_attentions + (layer_out[1], )

        outputs = (x, )
        if self.output_hidden_feats:
            outputs += (all_features, )
        if self.output_attentions:
            outputs += (all_attentions, )
        return outputs
|
||||
|
||||
|
||||
class BertPooler(nn.Layer):
    """Linear + tanh applied to the first element along axis 1 of the input."""

    def __init__(self, hidden_size):
        super(BertPooler, self).__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)
        self.act = nn.Tanh()

    def forward(self, x):
        """Return ``tanh(fc(x[:, 0]))``."""
        return self.act(self.fc(x[:, 0]))
|
||||
|
||||
|
||||
class METROEncoder(nn.Layer):
    """Transformer encoder over feature sequences with a linear residual branch.

    The input features are projected to ``hidden_size``, summed with a
    learnable position embedding, run through a ``BertEncoder`` and mapped to
    ``output_feature_dim``; a linear projection of the raw input is added to
    that prediction.

    Args:
        vocab_size (int): size of the (otherwise unused in ``forward``) word
            embedding table.
        num_hidden_layers (int): number of encoder layers.
        features_dims (int): width of the input features.
        position_embeddings_size (int): number of position embeddings.
        hidden_size (int): encoder width.
        intermediate_size (int): feed-forward hidden width.
        output_feature_dim (int): width of the prediction.
        num_attention_heads (int): number of attention heads.
        attention_probs_dropout_prob (float): attention dropout.
        fc_dropout_prob (float): dropout after linear projections.
        act_fn (str): activation name for the feed-forward blocks.
        output_attentions (bool): request attention outputs.
        output_hidden_feats (bool): request hidden-feature outputs.
        use_img_layernorm (bool): apply LayerNorm to the summed embeddings.
    """

    def __init__(self,
                 vocab_size,
                 num_hidden_layers,
                 features_dims,
                 position_embeddings_size,
                 hidden_size,
                 intermediate_size,
                 output_feature_dim,
                 num_attention_heads,
                 attention_probs_dropout_prob,
                 fc_dropout_prob,
                 act_fn='ReLU',
                 output_attentions=False,
                 output_hidden_feats=False,
                 use_img_layernorm=False):
        super(METROEncoder, self).__init__()
        self.img_dims = features_dims
        self.num_hidden_layers = num_hidden_layers
        self.use_img_layernorm = use_img_layernorm
        self.output_attentions = output_attentions
        # forward() reads this flag, so it has to be stored on the instance.
        self.output_hidden_feats = output_hidden_feats
        self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2,
                                        hidden_size, fc_dropout_prob)
        self.encoder = BertEncoder(
            num_hidden_layers, hidden_size, intermediate_size,
            num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob,
            act_fn, output_attentions, output_hidden_feats)
        self.pooler = BertPooler(hidden_size)
        self.position_embeddings = nn.Embedding(position_embeddings_size,
                                                hidden_size)
        self.img_embedding = nn.Linear(
            features_dims, hidden_size, bias_attr=True)
        self.dropout = nn.Dropout(fc_dropout_prob)
        self.cls_head = nn.Linear(hidden_size, output_feature_dim)
        self.residual = nn.Linear(features_dims, output_feature_dim)
        if use_img_layernorm:
            # forward() calls self.layernorm when the flag is set; create it
            # only in that case so the default parameter set is unchanged.
            self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8)

        self.apply(self.init_weights)

    def init_weights(self, module):
        """Initialize one sub-layer.

        Linear/Embedding weights are drawn from N(0, 0.02), LayerNorm is
        reset to weight 1 / bias 0, and Linear biases are zeroed.
        """
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.set_value(
                paddle.normal(
                    mean=0.0, std=0.02, shape=module.weight.shape))
        elif isinstance(module, nn.LayerNorm):
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))
            module.weight.set_value(
                paddle.full(
                    shape=module.weight.shape, fill_value=1.0))
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.set_value(paddle.zeros(shape=module.bias.shape))

    def forward(self, x):
        """Encode ``x`` and return the prediction.

        Args:
            x (Tensor): input features; the first two axes are read as
                (batch, sequence length) and the last axis is fed to linear
                layers built for ``features_dims``.

        Returns:
            Tensor, or a 3-tuple ``(pred_score, encoder_outputs[1],
            encoder_outputs[-1])`` when both ``output_attentions`` and
            ``output_hidden_feats`` are enabled.
        """
        batchsize, seq_len = paddle.shape(x)[:2]
        input_ids = paddle.zeros((batchsize, seq_len), dtype="int64")
        position_ids = paddle.arange(
            seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids)

        # All positions are attended to: the mask of ones becomes an additive
        # mask of zeros below.
        attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2)
        head_mask = [None] * self.num_hidden_layers

        position_embs = self.position_embeddings(position_ids)
        attention_mask = (1.0 - attention_mask) * -10000.0

        img_features = self.img_embedding(x)

        # We empirically observe that adding an additional learnable position
        # embedding leads to more stable training
        embeddings = position_embs + img_features
        if self.use_img_layernorm:
            embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)

        encoder_outputs = self.encoder(
            embeddings, attention_mask, head_mask=head_mask)

        pred_score = self.cls_head(encoder_outputs[0])
        res_img_feats = self.residual(x)
        pred_score = pred_score + res_img_feats

        if self.output_attentions and self.output_hidden_feats:
            return pred_score, encoder_outputs[1], encoder_outputs[-1]
        else:
            return pred_score
|
||||
|
||||
|
||||
def gelu(x):
    """Gaussian Error Linear Unit computed with the error function.

    Reference: https://arxiv.org/abs/1606.08415
    """
    half_x = x * 0.5
    return half_x * (1.0 + paddle.erf(x / math.sqrt(2.0)))
|
||||
|
||||
|
||||
@register
class TransEncoder(nn.Layer):
    """Chain of ``METROEncoder`` stages applied one after another.

    Stage ``i`` maps ``input_feat_dim[i]`` features to
    ``input_feat_dim[i + 1]`` features; the final stage outputs 3 features.
    """

    def __init__(self,
                 vocab_size=30522,
                 num_hidden_layers=4,
                 num_attention_heads=4,
                 position_embeddings_size=512,
                 intermediate_size=3072,
                 input_feat_dim=[2048, 512, 128],
                 hidden_feat_dim=[1024, 256, 128],
                 attention_probs_dropout_prob=0.1,
                 fc_dropout_prob=0.1,
                 act_fn='gelu',
                 output_attentions=False,
                 output_hidden_feats=False):
        super(TransEncoder, self).__init__()
        # Each stage outputs the next stage's input width; the last one
        # outputs 3 values.
        output_feat_dim = input_feat_dim[1:] + [3]

        stages = []
        for idx, output_feature_dim in enumerate(output_feat_dim):
            features_dims = input_feat_dim[idx]
            hidden_size = hidden_feat_dim[idx]

            # The hidden width must split evenly across the attention heads.
            assert hidden_size % num_attention_heads == 0
            stages.append(
                METROEncoder(
                    vocab_size=vocab_size,
                    num_hidden_layers=num_hidden_layers,
                    features_dims=features_dims,
                    position_embeddings_size=position_embeddings_size,
                    hidden_size=hidden_size,
                    intermediate_size=intermediate_size,
                    output_feature_dim=output_feature_dim,
                    num_attention_heads=num_attention_heads,
                    attention_probs_dropout_prob=attention_probs_dropout_prob,
                    fc_dropout_prob=fc_dropout_prob,
                    act_fn=act_fn,
                    output_attentions=output_attentions,
                    output_hidden_feats=output_hidden_feats))
        self.trans_encoder = paddle.nn.Sequential(*stages)

    def forward(self, x):
        """Feed ``x`` through every stage and return the last stage's output."""
        return self.trans_encoder(x)
|
||||
124
rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
Normal file
124
rtdetr_paddle/ppdet/modeling/backbones/transformer_utils.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from paddle.nn.initializer import TruncatedNormal, Constant, Assign
|
||||
|
||||
# Common initializations
|
||||
# Initializer instances defined once at module level: constant one,
# constant zero, and a truncated normal with standard deviation 0.02.
ones_ = Constant(value=1.)
zeros_ = Constant(value=0.)
trunc_normal_ = TruncatedNormal(std=.02)
|
||||
|
||||
|
||||
# Common Layers
|
||||
def drop_path(x, drop_prob=0., training=False):
    """Stochastic depth: randomly zero whole samples of ``x`` during training.

    Each sample along the first axis is kept with probability
    ``1 - drop_prob``; kept samples are divided by that keep probability.
    ``x`` is returned untouched when ``drop_prob`` equals 0 or ``training``
    is false.

    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956
    """
    if drop_prob == 0. or not training:
        return x

    keep = paddle.to_tensor(1 - drop_prob, dtype=x.dtype)
    # One random draw per sample, broadcast over all remaining axes.
    mask_shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1)
    # keep + U[0, 1) floors to 1 with probability `keep`, otherwise to 0.
    mask = paddle.floor(keep + paddle.rand(mask_shape, dtype=x.dtype))
    return x.divide(keep) * mask
|
||||
|
||||
|
||||
class DropPath(nn.Layer):
    """Layer wrapper around ``drop_path`` (stochastic depth per sample).

    Args:
        drop_prob (float, optional): probability of dropping a sample.
            ``None`` (the default) disables dropping.
    """

    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        """Apply stochastic depth to ``x``; active only in training mode."""
        if self.drop_prob is None:
            # The constructor default is None; passing it on would fail on
            # `1 - drop_prob` in training mode, so treat it as "never drop".
            return x
        return drop_path(x, self.drop_prob, self.training)
|
||||
|
||||
|
||||
class Identity(nn.Layer):
    """Pass-through layer: ``forward`` returns its argument unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, input):
        """Return ``input`` as is."""
        return input
|
||||
|
||||
|
||||
# common funcs
|
||||
|
||||
|
||||
def to_2tuple(x):
    """Return ``x`` unchanged if it is a list or tuple, else the pair ``(x, x)``."""
    return x if isinstance(x, (list, tuple)) else (x, x)
|
||||
|
||||
|
||||
def add_parameter(layer, datas, name=None):
    """Create a parameter on ``layer`` initialized from ``datas``.

    Args:
        layer: object providing ``create_parameter`` and ``add_parameter``.
        datas: initial values; its ``shape`` becomes the parameter shape.
        name (str, optional): when truthy, the parameter is also registered
            on ``layer`` under this name.

    Returns:
        The created parameter.
    """
    new_param = layer.create_parameter(
        shape=(datas.shape), default_initializer=Assign(datas))
    if not name:
        return new_param
    layer.add_parameter(name, new_param)
    return new_param
|
||||
|
||||
|
||||
def window_partition(x, window_size):
    """
    Partition into non-overlapping windows, padding the bottom/right if needed.

    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with
            [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition.
        (num_h, num_w): number of windows along height and width.
    """
    B, H, W, C = paddle.shape(x)

    # Amount needed to reach the next multiple of window_size (0 if aligned).
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size

    # F.pad is applied in channel-first layout, then the layout is restored.
    pad_spec = paddle.to_tensor(
        [0, int(pad_w), 0, int(pad_h)], dtype='int32')
    channel_first = x.transpose([0, 3, 1, 2])
    x = F.pad(channel_first, pad_spec).transpose([0, 2, 3, 1])

    Hp = H + pad_h
    Wp = W + pad_w
    num_h = Hp // window_size
    num_w = Wp // window_size

    grid = x.reshape([B, num_h, window_size, num_w, window_size, C])
    windows = grid.transpose([0, 1, 3, 2, 4, 5])
    windows = windows.reshape([-1, window_size, window_size, C])
    return windows, (Hp, Wp), (num_h, num_w)
|
||||
|
||||
|
||||
def window_unpartition(x, pad_hw, num_hw, hw):
    """
    Reassemble windows into the original sequences and remove the padding.

    Args:
        x (tensor): input tokens with
            [B * num_windows, window_size, window_size, C].
        pad_hw (Tuple): padded height and width (Hp, Wp).
        num_hw (Tuple): number of windows along height and width
            (num_h, num_w).
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    Hp, Wp = pad_hw
    num_h, num_w = num_hw
    H, W = hw

    total_windows, window_size, _, C = paddle.shape(x)
    B = total_windows // (num_h * num_w)

    grid = x.reshape([B, num_h, num_w, window_size, window_size, C])
    merged = grid.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C])

    # Crop away the rows/columns that were added as padding.
    return merged[:, :H, :W, :]
|
||||
652
rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
Normal file
652
rtdetr_paddle/ppdet/modeling/backbones/vision_transformer.py
Normal file
@@ -0,0 +1,652 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
from paddle.nn.initializer import Constant
|
||||
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
from .transformer_utils import zeros_, DropPath, Identity
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Two-layer perceptron: Linear -> activation -> dropout -> Linear -> dropout.

    Args:
        in_features (int): input width.
        hidden_features (int, optional): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): output width; falls back to
            ``in_features`` when falsy.
        act_layer: callable returning the activation layer.
        drop (float): dropout probability used after both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_width = out_features or in_features
        hidden_width = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, out_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
|
||||
|
||||
|
||||
class Attention(nn.Layer):
    """Multi-head self-attention with optional q/v bias and relative position bias.

    Args:
        dim (int): feature width.
        num_heads (int): number of attention heads.
        qkv_bias (bool): create learnable biases for q and v (k gets zeros).
        qk_scale (float, optional): score scale; falls back to
            ``head_dim ** -0.5`` when falsy.
        attn_drop (float): dropout on attention probabilities.
        proj_drop (float): dropout after the output projection.
        window_size (sequence, optional): when truthy, a relative position
            bias table is created for a grid of
            ``window_size[0] x window_size[1]`` tokens plus one extra token.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 qk_scale=None,
                 attn_drop=0.,
                 proj_drop=0.,
                 window_size=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim**-0.5

        # The bias is handled separately in forward(), so the layer has none.
        self.qkv = nn.Linear(dim, dim * 3, bias_attr=False)

        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            win_h = window_size[0]
            win_w = window_size[1]
            self.window_size = window_size
            # The 3 extra entries are for cls-to-token, token-to-cls and
            # cls-to-cls.
            self.num_relative_distance = (2 * win_h - 1) * (2 * win_w - 1) + 3
            self.relative_position_bias_table = self.create_parameter(
                shape=(self.num_relative_distance, num_heads),
                default_initializer=zeros_)  # 2*Wh-1 * 2*Ww-1, nH

            # Pair-wise relative position index for each token in the window.
            grid = paddle.stack(
                paddle.meshgrid(
                    [paddle.arange(win_h), paddle.arange(win_w)]))  # 2, Wh, Ww
            flat = paddle.flatten(grid, 1)  # 2, Wh*Ww
            lhs = paddle.unsqueeze(flat, 2)
            rhs = paddle.unsqueeze(flat, 1)
            rel = lhs.clone() - rhs.clone()  # 2, Wh*Ww, Wh*Ww
            rel = rel.transpose((1, 2, 0))  # Wh*Ww, Wh*Ww, 2
            rel[:, :, 0] += win_h - 1  # shift to start from 0
            rel[:, :, 1] += win_w - 1
            rel[:, :, 0] *= 2 * win_w - 1

            index = paddle.zeros(
                shape=(win_h * win_w + 1, ) * 2, dtype=rel.dtype)
            index[1:, 1:] = rel.sum(-1)  # Wh*Ww, Wh*Ww
            index[0, 0:] = self.num_relative_distance - 3
            index[0:, 0] = self.num_relative_distance - 2
            index[0, 0] = self.num_relative_distance - 1

            self.register_buffer("relative_position_index", index)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x, rel_pos_bias=None):
        """Apply attention to ``x``.

        Args:
            x (Tensor): input whose axes 1 and 2 are read as token count and
                feature width.
            rel_pos_bias (Tensor, optional): extra bias added to the
                attention scores before the softmax.

        Returns:
            Tensor reshaped to ``(-1, N, C)`` after projection and dropout.
        """
        shape = paddle.shape(x)
        N, C = shape[1], shape[2]

        bias = None
        if self.q_bias is not None:
            # Concatenated in q, k, v order; the k part is all zeros.
            bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
        qkv = F.linear(x, weight=self.qkv.weight, bias=bias)

        qkv = qkv.reshape((-1, N, 3, self.num_heads, C // self.num_heads))
        qkv = qkv.transpose((2, 0, 3, 1, 4))
        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = q.matmul(k.transpose((0, 1, 3, 2))) * self.scale

        if self.relative_position_bias_table is not None:
            tokens = self.window_size[0] * self.window_size[1] + 1
            flat_index = self.relative_position_index.reshape([-1])
            table_bias = self.relative_position_bias_table[flat_index]
            table_bias = table_bias.reshape(
                [tokens, tokens, -1])  # Wh*Ww,Wh*Ww,nH
            table_bias = table_bias.transpose(
                (2, 0, 1))  # nH, Wh*Ww, Wh*Ww
            attn = attn + table_bias.unsqueeze(0)
        if rel_pos_bias is not None:
            attn = attn + rel_pos_bias

        attn = self.attn_drop(F.softmax(attn, axis=-1))

        out = attn.matmul(v).transpose((0, 2, 1, 3)).reshape((-1, N, C))
        return self.proj_drop(self.proj(out))
|
||||
|
||||
|
||||
class Block(nn.Layer):
    """Pre-norm transformer block: attention and MLP, each with a residual path.

    Args:
        dim (int): feature width.
        num_heads (int): number of attention heads.
        mlp_ratio (float): MLP hidden width as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop, window_size: forwarded to ``Attention``.
        drop (float): dropout for the attention projection and the MLP.
        drop_path (float): stochastic-depth rate; values <= 0 disable it.
        init_values (float, optional): when not None, learnable per-channel
            scales initialized to this value are applied to both branches.
        act_layer: activation layer factory for the MLP.
        norm_layer (str): expression naming the second norm layer class.
        epsilon (float): epsilon for the second norm layer.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 window_size=None,
                 init_values=None,
                 act_layer=nn.GELU,
                 norm_layer='nn.LayerNorm',
                 epsilon=1e-5):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, epsilon=1e-6)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            window_size=window_size)
        # Stochastic depth on the residual branches, or a no-op when disabled.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        # norm_layer arrives as a string and is resolved with eval().
        norm_cls = eval(norm_layer)
        self.norm2 = norm_cls(dim, epsilon=epsilon)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)

        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))

    def forward(self, x, rel_pos_bias=None):
        """Return ``x`` after the attention and MLP residual updates."""
        use_scale = self.gamma_1 is not None

        attn_branch = self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
        if use_scale:
            attn_branch = self.gamma_1 * attn_branch
        x = x + self.drop_path(attn_branch)

        mlp_branch = self.mlp(self.norm2(x))
        if use_scale:
            mlp_branch = self.gamma_2 * mlp_branch
        x = x + self.drop_path(mlp_branch)
        return x
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """Image to patch embedding via a strided convolution.

    Args:
        img_size (sequence): two-element image size; each entry is divided
            by ``patch_size`` to obtain the patch grid.
        patch_size (int): kernel size and stride of the projection.
        in_chans (int): number of input channels.
        embed_dim (int): number of output channels.
    """

    def __init__(self,
                 img_size=[224, 224],
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768):
        super().__init__()
        grid_0 = img_size[0] // patch_size
        grid_1 = img_size[1] // patch_size

        self.num_patches_w = grid_0
        self.num_patches_h = grid_1
        self.patch_shape = (grid_0, grid_1)
        self.num_patches = grid_0 * grid_1
        self.img_size = img_size
        self.patch_size = patch_size

        self.proj = nn.Conv2D(
            in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    @property
    def num_patches_in_h(self):
        """``img_size[1] // patch_size``."""
        return self.img_size[1] // self.patch_size

    @property
    def num_patches_in_w(self):
        """``img_size[0] // patch_size``."""
        return self.img_size[0] // self.patch_size

    def forward(self, x, mask=None):
        """Project ``x`` with the patch convolution; ``mask`` is not used."""
        # Unpacking fails unless x.shape has exactly four entries.
        _b, _c, _h, _w = x.shape
        return self.proj(x)
|
||||
|
||||
|
||||
class RelativePositionBias(nn.Layer):
    """Learnable relative position bias for a token grid plus one extra token.

    Args:
        window_size (sequence): grid size ``(Wh, Ww)``.
        num_heads (int): number of attention heads; one bias per head.
    """

    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        # The 3 extra entries are for cls-to-token, token-to-cls and
        # cls-to-cls.
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1) + 3
        # create_parameter's keyword is `default_initializer`.
        self.relative_position_bias_table = self.create_parameter(
            shape=(self.num_relative_distance, num_heads),
            default_initializer=zeros_)

        # get pair-wise relative position index for each token inside the window
        coords_h = paddle.arange(window_size[0])
        coords_w = paddle.arange(window_size[1])
        coords = paddle.stack(paddle.meshgrid(
            [coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = coords.flatten(1)  # 2, Wh*Ww

        # Built the same way as in Attention.__init__ of this file.
        coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2)
        coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1)
        relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone(
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.transpose(
            (1, 2, 0))  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1

        # paddle.zeros takes `shape`, not `size`.
        relative_position_index = paddle.zeros(
            shape=(window_size[0] * window_size[1] + 1, ) * 2,
            dtype=relative_coords.dtype)
        relative_position_index[1:, 1:] = relative_coords.sum(
            -1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1
        self.register_buffer("relative_position_index", relative_position_index)

    def forward(self):
        """Return the bias gathered from the table, laid out as (nH, T, T).

        ``T`` is ``window_size[0] * window_size[1] + 1``.
        """
        num_tokens = self.window_size[0] * self.window_size[1] + 1
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape([-1])].reshape(
                [num_tokens, num_tokens, -1])  # Wh*Ww,Wh*Ww,nH
        return relative_position_bias.transpose((2, 0, 1))  # nH, Wh*Ww, Wh*Ww
|
||||
|
||||
|
||||
def get_sinusoid_encoding_table(n_position, d_hid, token=False):
    """Build a sinusoidal position encoding table.

    Args:
        n_position (int): number of positions (rows).
        d_hid (int): encoding width (columns).
        token (bool): when True, one extra all-zero row is appended after
            the position rows.

    Returns:
        float32 paddle tensor of shape ``(1, rows, d_hid)``, where ``rows``
        is ``n_position`` plus one if ``token`` is set.
    """

    def get_position_angle_vec(position):
        # Columns 2i and 2i+1 share the same frequency.
        return [
            position / np.power(10000, 2 * (hid_j // 2) / d_hid)
            for hid_j in range(d_hid)
        ]

    sinusoid_table = np.array(
        [get_position_angle_vec(pos_i) for pos_i in range(n_position)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
    if token:
        # numpy's keyword is `axis`; `dim` raises TypeError.
        sinusoid_table = np.concatenate(
            [sinusoid_table, np.zeros([1, d_hid])], axis=0)

    return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0)
|
||||
|
||||
|
||||
@register
|
||||
@serializable
|
||||
class VisionTransformer(nn.Layer):
|
||||
""" Vision Transformer with support for patch input
|
||||
"""
|
||||
|
||||
def __init__(self,
             img_size=[672, 1092],
             patch_size=16,
             in_chans=3,
             embed_dim=768,
             depth=12,
             num_heads=12,
             mlp_ratio=4,
             qkv_bias=False,
             qk_scale=None,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             norm_layer='nn.LayerNorm',
             init_values=None,
             use_rel_pos_bias=False,
             use_shared_rel_pos_bias=False,
             epsilon=1e-5,
             final_norm=False,
             pretrained=None,
             out_indices=[3, 5, 7, 11],
             use_abs_pos_emb=False,
             use_sincos_pos_emb=True,
             with_fpn=True,
             num_fpn_levels=4,
             use_checkpoint=False,
             **args):
    """Build the patch embedding, position embedding, blocks and optional FPN.

    Pretrained weights are loaded through ``init_weight`` once the blocks
    exist. Extra keyword arguments in ``args`` are accepted and ignored.
    """
    super().__init__()
    self.img_size = img_size
    self.embed_dim = embed_dim
    self.with_fpn = with_fpn
    self.use_checkpoint = use_checkpoint
    self.use_sincos_pos_emb = use_sincos_pos_emb
    self.use_rel_pos_bias = use_rel_pos_bias
    self.final_norm = final_norm
    self.out_indices = out_indices
    self.num_fpn_levels = num_fpn_levels

    if use_checkpoint:
        paddle.seed(0)

    self.patch_embed = PatchEmbed(
        img_size=img_size,
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim)

    self.pos_w = self.patch_embed.num_patches_in_w
    self.pos_h = self.patch_embed.num_patches_in_h

    self.cls_token = self.create_parameter(
        shape=(1, 1, embed_dim),
        default_initializer=paddle.nn.initializer.Constant(value=0.))

    # Position embedding: learnable, fixed sin-cos, or none.
    if use_abs_pos_emb:
        self.pos_embed = self.create_parameter(
            shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
            default_initializer=paddle.nn.initializer.TruncatedNormal(
                std=.02))
    elif use_sincos_pos_emb:
        sincos = self.build_2d_sincos_position_embedding(embed_dim)

        self.pos_embed = sincos
        self.pos_embed = self.create_parameter(shape=sincos.shape)
        self.pos_embed.set_value(sincos.numpy())
        # The sin-cos table is excluded from gradient computation.
        self.pos_embed.stop_gradient = True
    else:
        self.pos_embed = None

    self.pos_drop = nn.Dropout(p=drop_rate)

    if use_shared_rel_pos_bias:
        self.rel_pos_bias = RelativePositionBias(
            window_size=self.patch_embed.patch_shape, num_heads=num_heads)
    else:
        self.rel_pos_bias = None

    # Stochastic-depth rate grows linearly from 0 to drop_path_rate.
    dpr = np.linspace(0, drop_path_rate, depth)
    block_window = self.patch_embed.patch_shape if use_rel_pos_bias else None

    self.blocks = nn.LayerList([
        Block(
            dim=embed_dim,
            num_heads=num_heads,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            init_values=init_values,
            window_size=block_window,
            epsilon=epsilon) for i in range(depth)
    ])

    self.pretrained = pretrained
    self.init_weight()

    assert len(out_indices) <= 4, ''
    self.out_indices = out_indices
    self.out_channels = [embed_dim] * num_fpn_levels
    if with_fpn:
        self.out_strides = [4, 8, 16, 32][-num_fpn_levels:]
    else:
        self.out_strides = [patch_size] * len(out_indices)

    self.norm = Identity()

    if self.with_fpn:
        assert num_fpn_levels <= 4, ''
        self.init_fpn(
            embed_dim=embed_dim,
            patch_size=patch_size, )
|
||||
|
||||
def init_weight(self):
    """Load the pretrained weights named by ``self.pretrained``, if any.

    ``self.pretrained`` is treated as a URL when it contains ``'http'`` (it is
    then fetched with ``paddle.utils.download.get_weights_path_from_url``) and
    as a local path otherwise. When the checkpoint holds a ``pos_embed`` entry
    whose shape differs from this model's ``pos_embed``, the entry is resized
    with ``self.resize_pos_embed`` to the ``(self.pos_h, self.pos_w)`` grid
    before the state dict is applied. Nothing happens when
    ``self.pretrained`` is falsy.
    """
    pretrained = self.pretrained
    if not pretrained:
        return

    if 'http' in pretrained:  # URL
        path = paddle.utils.download.get_weights_path_from_url(pretrained)
    else:  # model in local path
        path = pretrained

    load_state_dict = paddle.load(path)
    pos_embed_name = "pos_embed"

    # Only adapt the positional embedding when this model actually owns one:
    # self.pos_embed is None when no positional embedding was requested, and
    # reading ``.shape`` on it would raise AttributeError.
    if pos_embed_name in load_state_dict and self.pos_embed is not None:
        load_pos_embed = paddle.to_tensor(
            load_state_dict[pos_embed_name], dtype="float32")
        if self.pos_embed.shape != load_pos_embed.shape:
            # The "- 1" skips one leading token; the remaining tokens are
            # assumed to form a square grid -- TODO confirm for the
            # checkpoints in use.
            pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
            load_state_dict[pos_embed_name] = self.resize_pos_embed(
                load_pos_embed, (pos_size, pos_size),
                (self.pos_h, self.pos_w))

            print("Load pos_embed and resize it from {} to {} .".format(
                load_pos_embed.shape, self.pos_embed.shape))

    self.set_state_dict(load_state_dict)
    print("Load load_state_dict....")
|
||||
|
||||
def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
    """Create the four per-level ops ``fpn1`` .. ``fpn4`` and ``self.norm``.

    Args:
        embed_dim (int): channel count used by every convolution / norm.
        patch_size (int): 16 or 8; selects which set of ops is built. Any
            other value leaves ``fpn1`` .. ``fpn4`` undefined.
        out_with_norm (bool): when True ``self.norm`` becomes an
            ``nn.LayerNorm`` (epsilon 1e-6), otherwise an ``Identity``.
    """

    def deconv_x2():
        # 2x2 transposed convolution with stride 2, channels unchanged.
        return nn.Conv2DTranspose(
            embed_dim, embed_dim, kernel_size=2, stride=2)

    if patch_size == 16:
        # two transposed convolutions with BatchNorm + GELU in between
        self.fpn1 = nn.Sequential(
            deconv_x2(),
            nn.BatchNorm2D(embed_dim),
            nn.GELU(),
            deconv_x2(), )
        self.fpn2 = nn.Sequential(deconv_x2(), )
        self.fpn3 = Identity()
        self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
    elif patch_size == 8:
        self.fpn1 = nn.Sequential(deconv_x2(), )
        self.fpn2 = Identity()
        self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )
        self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )

    if out_with_norm:
        self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6)
    else:
        self.norm = Identity()
|
||||
|
||||
def interpolate_pos_encoding(self, x, w, h):
    """Return ``self.pos_embed`` adapted to the token grid of ``x``.

    Args:
        x (Tensor): token sequence; ``x.shape[1] - 1`` is compared with the
            number of non-leading entries of ``self.pos_embed``.
        w, h (int): input extents, each floor-divided by
            ``self.patch_embed.patch_size`` to obtain the target grid.

    Returns:
        Tensor: ``self.pos_embed`` itself when the token count and both grid
        extents already match the patch embedding; otherwise the first entry
        concatenated with a bicubic resampling of the remaining entries.
    """
    patch_embed = self.patch_embed
    num_tokens = x.shape[1] - 1
    num_positions = self.pos_embed.shape[1] - 1
    w0 = w // patch_embed.patch_size
    h0 = h // patch_embed.patch_size
    if (num_tokens == num_positions and
            w0 == patch_embed.num_patches_w and
            h0 == patch_embed.num_patches_h):
        return self.pos_embed

    first_embed = self.pos_embed[:, 0]
    grid_embed = self.pos_embed[:, 1:]
    dim = x.shape[-1]

    # lay the stored embeddings out as a channel-first map before resampling
    grid_embed = grid_embed.reshape(
        [1, patch_embed.num_patches_w, patch_embed.num_patches_h, dim])
    grid_embed = nn.functional.interpolate(
        grid_embed.transpose((0, 3, 1, 2)),
        (w0, h0),
        mode='bicubic', )

    assert int(w0) == grid_embed.shape[-2] and int(
        h0) == grid_embed.shape[-1]
    grid_embed = grid_embed.transpose((0, 2, 3, 1))
    grid_embed = grid_embed.reshape([1, -1, dim])
    return paddle.concat((first_embed.unsqueeze(0), grid_embed), axis=1)
|
||||
|
||||
def resize_pos_embed(self, pos_embed, old_hw, new_hw):
    """
    Resize pos_embed weight.
    Args:
        pos_embed (Tensor): the pos_embed weight
        old_hw (list[int]): the height and width of old pos_embed
        new_hw (list[int]): the height and width of new pos_embed
    Returns:
        Tensor: the resized pos_embed weight
    """
    # The first entry is kept untouched; only the rest is resampled.
    leading = pos_embed[:, :1, :]
    grid = pos_embed[:, 1:, :].transpose([0, 2, 1])
    grid = grid.reshape([1, -1, old_hw[0], old_hw[1]])
    grid = F.interpolate(grid, new_hw, mode='bicubic', align_corners=False)
    tokens = grid.flatten(2).transpose([0, 2, 1])
    return paddle.concat([leading, tokens], axis=1)
|
||||
|
||||
def build_2d_sincos_position_embedding(
        self,
        embed_dim=768,
        temperature=10000., ):
    """Build a 2D sine-cosine position table with one extra zero entry.

    The grid extents come from ``self.patch_embed.patch_shape``.

    Args:
        embed_dim (int): embedding width; must be divisible by 4.
        temperature (float): base of the frequency progression.

    Returns:
        Tensor: shape ``[1, 1 + h * w, embed_dim]``; the first entry is all
        zeros, the rest is ``[sin(w), cos(w), sin(h), cos(h)]`` per position.
    """
    h, w = self.patch_embed.patch_shape
    axis_w = paddle.arange(w, dtype=paddle.float32)
    axis_h = paddle.arange(h, dtype=paddle.float32)
    mesh_w, mesh_h = paddle.meshgrid(axis_w, axis_h)
    assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
    pos_dim = embed_dim // 4
    omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
    omega = 1. / (temperature**omega)

    # outer product of every coordinate with every frequency
    freq_row = omega[None]
    out_w = mesh_w.flatten()[..., None] @ freq_row
    out_h = mesh_h.flatten()[..., None] @ freq_row

    parts = [
        paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
        paddle.cos(out_h)
    ]
    pos_emb = paddle.concat(parts, axis=1)[None, :, :]

    zero_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32)
    return paddle.concat([zero_token, pos_emb], axis=1)
|
||||
|
||||
def forward(self, x):
    """Run the backbone and return the selected feature maps.

    Args:
        x (Tensor | dict): a 4-D image tensor, or a dict holding it under
            the ``'image'`` key.

    Returns:
        list[Tensor]: one ``[B, D, Hp, Wp]`` map per block index listed in
        ``self.out_indices``; when ``self.with_fpn`` is set, the outputs of
        the last ``self.num_fpn_levels`` FPN ops instead.
    """
    if isinstance(x, dict):
        x = x['image']
    _, _, h, w = x.shape

    x = self.patch_embed(x)
    B, D, Hp, Wp = x.shape  # b * c * h * w

    cls_shape = (B, self.cls_token.shape[-2], self.cls_token.shape[-1])
    cls_tokens = self.cls_token.expand(cls_shape)
    x = x.flatten(2).transpose([0, 2, 1])  # b * hw * c
    x = paddle.concat([cls_tokens, x], axis=1)

    if self.pos_embed is not None:
        x = x + self.interpolate_pos_encoding(x, h, w)

    x = self.pos_drop(x)

    rel_pos_bias = None
    if self.rel_pos_bias is not None:
        rel_pos_bias = self.rel_pos_bias()

    feats = []
    for idx, blk in enumerate(self.blocks):
        if self.use_checkpoint and self.training:
            x = paddle.distributed.fleet.utils.recompute(
                blk, x, rel_pos_bias, **{"preserve_rng_state": True})
        else:
            x = blk(x, rel_pos_bias)

        if idx not in self.out_indices:
            continue
        # drop the leading token and restore the channel-first 2D layout
        tokens = self.norm(x[:, 1:, :])
        channel_first = paddle.transpose(tokens, perm=[0, 2, 1])
        feats.append(paddle.reshape(channel_first, shape=[B, D, Hp, Wp]))

    if not self.with_fpn:
        return feats

    fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][
        -self.num_fpn_levels:]
    assert len(fpns) == len(feats) or len(feats) == 1, ''
    # a single feature map is shared by every FPN op
    per_level = len(feats) == len(fpns)
    outputs = []
    for i, op in enumerate(fpns):
        source = feats[i] if per_level else feats[-1]
        outputs.append(op(source))
    return outputs
|
||||
|
||||
@property
def num_layers(self):
    """Number of entries in ``self.blocks``."""
    block_count = len(self.blocks)
    return block_count
|
||||
|
||||
@property
def no_weight_decay(self):
    """Names of the parameters reported as exempt from weight decay."""
    exempt = ('pos_embed', 'cls_token')
    return set(exempt)
|
||||
|
||||
@property
def out_shape(self):
    """One ``ShapeSpec`` per output, pairing ``out_channels`` with ``out_strides``."""
    specs = []
    for channels, stride in zip(self.out_channels, self.out_strides):
        specs.append(ShapeSpec(channels=channels, stride=stride))
    return specs
|
||||
749
rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
Normal file
749
rtdetr_paddle/ppdet/modeling/backbones/vit_mae.py
Normal file
@@ -0,0 +1,749 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
import numpy as np
|
||||
import math
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
from paddle.nn.initializer import Constant, TruncatedNormal
|
||||
|
||||
from ppdet.modeling.shape_spec import ShapeSpec
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
from .transformer_utils import (zeros_, DropPath, Identity, window_partition,
|
||||
window_unpartition)
|
||||
from ..initializer import linear_init_
|
||||
|
||||
__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid']
|
||||
|
||||
|
||||
class Mlp(nn.Layer):
    """Feed-forward sub-layer: ``fc1 -> activation -> dropout -> fc2 -> dropout``.

    Args:
        in_features (int): size of the last input dimension.
        hidden_features (int, optional): hidden width; falls back to
            ``in_features`` when falsy.
        out_features (int, optional): output width; falls back to
            ``in_features`` when falsy.
        act_layer (str): expression naming the activation class; it is
            resolved with ``eval`` and instantiated without arguments.
        drop (float): dropout probability used after both linear layers.
        lr_factor (float): learning-rate multiplier set on the weight and
            bias of both linear layers.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer='nn.GELU',
                 drop=0.,
                 lr_factor=1.0):
        super().__init__()
        out_dim = out_features or in_features
        hidden_dim = hidden_features or in_features

        self.fc1 = nn.Linear(
            in_features,
            hidden_dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        # NOTE(review): eval() on a configuration string -- act_layer must
        # only ever come from trusted configuration.
        activation_cls = eval(act_layer)
        self.act = activation_cls()
        self.fc2 = nn.Linear(
            hidden_dim,
            out_dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.drop = nn.Dropout(drop)

        self._init_weights()

    def _init_weights(self):
        """Apply ``linear_init_`` to both linear layers."""
        for layer in (self.fc1, self.fc2):
            linear_init_(layer)

    def forward(self, x):
        """Apply the feed-forward transform to ``x`` and return the result."""
        hidden = self.fc1(x)
        hidden = self.act(hidden)
        hidden = self.drop(hidden)
        out = self.fc2(hidden)
        return self.drop(out)
|
||||
|
||||
|
||||
class Attention(nn.Layer):
    """Multi-head self-attention over a 2D token grid.

    The input of ``forward`` is a ``[B, H, W, C]`` tensor. Optionally adds
    decomposed relative position embeddings to the attention logits.

    Args:
        dim (int): channel count ``C`` of the input.
        num_heads (int): number of attention heads; ``dim // num_heads`` is
            the per-head width.
        qkv_bias (bool): when True, learnable ``q_bias`` / ``v_bias`` vectors
            are created and used as the projection bias (the key part of the
            bias is fixed to zero).
        attn_bias (bool): when True the fused ``qkv`` linear layer owns a
            bias of its own.
        attn_drop (float): dropout probability on the attention weights.
        proj_drop (float): accepted for interface compatibility; this layer
            does not apply any dropout after the output projection.
        use_rel_pos (bool): add decomposed relative position embeddings.
        rel_pos_zero_init (bool): keep the relative position tables at zero;
            when False they are drawn from a truncated normal (std 0.02).
        window_size (int, optional): extent used to size the relative
            position tables; falls back to ``input_size[0]`` when None.
        input_size (list[int], optional): grid size, read only when
            ``window_size`` is None.
        qk_scale (float, optional): logit scale; defaults to
            ``head_dim ** -0.5`` when falsy.
        lr_factor (float): learning-rate multiplier for this layer's
            parameters.
    """

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_bias=False,
                 attn_drop=0.,
                 proj_drop=0.,
                 use_rel_pos=False,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 qk_scale=None,
                 lr_factor=1.0):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = qk_scale or self.head_dim**-0.5
        self.use_rel_pos = use_rel_pos
        self.input_size = input_size
        self.rel_pos_zero_init = rel_pos_zero_init
        self.window_size = window_size
        self.lr_factor = lr_factor

        self.qkv = nn.Linear(
            dim,
            dim * 3,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor)
            if attn_bias else False)
        if qkv_bias:
            self.q_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
            self.v_bias = self.create_parameter(
                shape=([dim]), default_initializer=zeros_)
        else:
            self.q_bias = None
            self.v_bias = None
        self.proj = nn.Linear(
            dim,
            dim,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))
        self.attn_drop = nn.Dropout(attn_drop)
        if window_size is None:
            # global attention: size the relative tables from the full grid
            self.window_size = self.input_size[0]

        self._init_weights()

    def _init_weights(self):
        """Initialise the linear layers and create the relative position tables."""
        linear_init_(self.qkv)
        linear_init_(self.proj)

        if self.use_rel_pos:
            self.rel_pos_h = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))
            self.rel_pos_w = self.create_parameter(
                [2 * self.window_size - 1, self.head_dim],
                attr=ParamAttr(learning_rate=self.lr_factor),
                default_initializer=Constant(value=0.))

            if not self.rel_pos_zero_init:
                # TruncatedNormal(...) only builds an initializer object; it
                # has to be called on the parameter to actually fill it.
                trunc_normal = TruncatedNormal(std=0.02)
                trunc_normal(self.rel_pos_h)
                trunc_normal(self.rel_pos_w)

    def get_rel_pos(self, seq_size, rel_pos):
        """Gather relative position embeddings for a sequence of ``seq_size``.

        Args:
            seq_size (int): number of positions along one axis.
            rel_pos (Tensor): table of shape ``[L, head_dim]``; it is linearly
                interpolated to ``2 * seq_size - 1`` rows when ``L`` differs.

        Returns:
            Tensor: ``[seq_size, seq_size, head_dim]`` embeddings indexed by
            (query position, key position).
        """
        max_rel_dist = int(2 * seq_size - 1)
        # Interpolate rel pos if needed.
        if rel_pos.shape[0] != max_rel_dist:
            rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1])
            rel_pos = rel_pos.transpose([0, 2, 1])
            rel_pos_resized = F.interpolate(
                rel_pos,
                size=(max_rel_dist, ),
                mode="linear",
                data_format='NCW')
            rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist])
            rel_pos_resized = rel_pos_resized.transpose([1, 0])
        else:
            rel_pos_resized = rel_pos

        coords = paddle.arange(seq_size, dtype='float32')
        relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0)
        # shift offsets from [-(n-1), n-1] to [0, 2n-2] so they index rows
        relative_coords += (seq_size - 1)
        relative_coords = relative_coords.astype('int64').flatten()

        return paddle.index_select(rel_pos_resized, relative_coords).reshape(
            [seq_size, seq_size, self.head_dim])

    def add_decomposed_rel_pos(self, attn, q, h, w):
        """
        Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
        Args:
            attn (Tensor): attention map.
            q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
            h (int): height of the token grid.
            w (int): width of the token grid.
        Returns:
            attn (Tensor): attention map with added relative positional embeddings.
        """
        Rh = self.get_rel_pos(h, self.rel_pos_h)
        Rw = self.get_rel_pos(w, self.rel_pos_w)

        B, _, dim = q.shape
        r_q = q.reshape([B, h, w, dim])
        # bhwc,hkc->bhwk, then a trailing axis so it broadcasts over key width
        rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1)
        # bhwc,wkc->bhwk, then an axis so it broadcasts over key height
        rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2)

        attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w
        return attn.reshape([B, h * w, h * w])

    def forward(self, x):
        """Apply self-attention to ``x`` of shape ``[B, H, W, C]``; same shape out."""
        B, H, W, C = paddle.shape(x)

        if self.q_bias is not None:
            qkv_bias = paddle.concat(
                (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias))
            qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias)
        else:
            qkv = self.qkv(x)

        # Both branches must be split into per-head q/k/v; previously only
        # the branch without q_bias/v_bias was reshaped.
        qkv = qkv.reshape(
            [B, H * W, 3, self.num_heads, self.head_dim]).transpose(
                [2, 0, 3, 1, 4]).reshape(
                    [3, B * self.num_heads, H * W, self.head_dim])

        q, k, v = qkv[0], qkv[1], qkv[2]
        attn = q.matmul(k.transpose([0, 2, 1])) * self.scale

        if self.use_rel_pos:
            attn = self.add_decomposed_rel_pos(attn, q, H, W)

        attn = F.softmax(attn, axis=-1)
        attn = self.attn_drop(attn)
        x = attn.matmul(v).reshape(
            [B, self.num_heads, H * W, self.head_dim]).transpose(
                [0, 2, 1, 3]).reshape([B, H, W, C])
        x = self.proj(x)
        return x
|
||||
|
||||
|
||||
class Block(nn.Layer):
    """Pre-norm transformer block: attention and MLP, each with a residual.

    Args:
        dim (int): channel count of the tokens.
        num_heads (int): attention head count.
        mlp_ratio (float): hidden width of the MLP as a multiple of ``dim``.
        qkv_bias, attn_bias, qk_scale, attn_drop, use_rel_pos,
        rel_pos_zero_init, window_size, input_size: forwarded to ``Attention``.
        init_values (float, optional): when not None, per-channel scale
            vectors ``gamma_1`` / ``gamma_2`` are created with this value.
        drop (float): dropout probability passed to ``Attention`` (as
            ``proj_drop``) and to ``Mlp``.
        drop_path (float): ``DropPath`` rate; an ``Identity`` is used when it
            is not greater than 0.
        act_layer (str): activation expression handed to ``Mlp``.
        norm_layer (str): expression naming the norm class, resolved with
            ``eval``.
        lr_factor (float): learning-rate multiplier for this block.
        epsilon (float): epsilon passed to both norm layers.
    """

    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.,
                 qkv_bias=False,
                 attn_bias=False,
                 qk_scale=None,
                 init_values=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 use_rel_pos=True,
                 rel_pos_zero_init=True,
                 window_size=None,
                 input_size=None,
                 act_layer='nn.GELU',
                 norm_layer='nn.LayerNorm',
                 lr_factor=1.0,
                 epsilon=1e-5):
        super().__init__()
        self.window_size = window_size

        def norm_options():
            # fresh attribute objects per norm layer; no weight decay on norms
            return dict(
                weight_attr=ParamAttr(
                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(
                    learning_rate=lr_factor, regularizer=L2Decay(0.0)),
                epsilon=epsilon)

        # NOTE(review): eval() on a configuration string -- norm_layer must
        # only ever come from trusted configuration.
        self.norm1 = eval(norm_layer)(dim, **norm_options())
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_bias=attn_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            window_size=window_size,
            input_size=input_size,
            lr_factor=lr_factor)

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = Identity()
        self.norm2 = eval(norm_layer)(dim, **norm_options())
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop,
                       lr_factor=lr_factor)
        if init_values is None:
            self.gamma_1, self.gamma_2 = None, None
        else:
            self.gamma_1 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))
            self.gamma_2 = self.create_parameter(
                shape=([dim]), default_initializer=Constant(value=init_values))

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same layout."""
        use_windows = self.window_size is not None

        attn_out = self.norm1(x)
        if use_windows:
            attn_out, pad_hw, num_hw = window_partition(attn_out,
                                                        self.window_size)
        attn_out = self.attn(attn_out)
        if self.gamma_1 is not None:
            attn_out = self.gamma_1 * attn_out
        if use_windows:
            # undo the partition back to the (x.shape[1], x.shape[2]) grid
            attn_out = window_unpartition(attn_out, pad_hw, num_hw,
                                          (x.shape[1], x.shape[2]))
        x = x + self.drop_path(attn_out)

        mlp_out = self.mlp(self.norm2(x))
        if self.gamma_2 is not None:
            mlp_out = self.gamma_2 * mlp_out
        return x + self.drop_path(mlp_out)
|
||||
|
||||
|
||||
class PatchEmbed(nn.Layer):
    """Image to patch embedding via one strided convolution.

    Args:
        img_size (tuple[int, int]): nominal input size, used only by the
            ``num_patches_in_*`` properties.
        patch_size (int): kernel size and stride of the projection.
        in_chans (int): input channel count.
        embed_dim (int): output channel count.
        lr_factor (float): learning-rate multiplier of the projection.
    """

    def __init__(self,
                 img_size=(224, 224),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 lr_factor=0.01):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.proj = nn.Conv2D(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
            weight_attr=ParamAttr(learning_rate=lr_factor),
            bias_attr=ParamAttr(learning_rate=lr_factor))

    # NOTE(review): the two properties below read img_size as
    # (width, height); confirm this matches how callers order img_size.
    @property
    def num_patches_in_h(self):
        """``img_size[1]`` floor-divided by the patch size."""
        extent = self.img_size[1]
        return extent // self.patch_size

    @property
    def num_patches_in_w(self):
        """``img_size[0]`` floor-divided by the patch size."""
        extent = self.img_size[0]
        return extent // self.patch_size

    def forward(self, x):
        """Project ``x`` with the patch convolution and return the result."""
        return self.proj(x)
|
||||
|
||||
|
||||
@register
@serializable
class VisionTransformer2D(nn.Layer):
    """Vision Transformer that keeps tokens on a 2D ``[B, Hp, Wp, C]`` grid.

    Args:
        img_size (tuple[int, int]): nominal input size; fixes the patch grid
            used to size position tables (read as ``(h, w)`` here).
        patch_size (int): patch size of the embedding convolution.
        in_chans (int): input channel count.
        embed_dim (int): token width.
        depth (int): number of transformer blocks.
        num_heads (int): attention heads per block.
        mlp_ratio (float): MLP hidden width as a multiple of ``embed_dim``.
        qkv_bias, attn_bias, qk_scale, init_values, drop_rate,
        attn_drop_rate, act_layer, norm_layer, use_rel_pos,
        rel_pos_zero_init, epsilon: forwarded to every ``Block``.
        drop_path_rate (float): final drop-path rate; per-block rates grow
            linearly from 0 to this value.
        lr_decay_rate (float): block ``i`` gets learning-rate factor
            ``lr_decay_rate ** (depth - i)``.
        global_attn_indexes (tuple[int]): blocks that use ``window_size=None``;
            all other blocks use ``window_size``.
        use_abs_pos (bool): add a sine-cosine position table in ``forward``.
        use_abs_pos_emb (bool): create and add a learnable ``pos_embed``.
        use_sincos_pos_emb (bool): create a frozen sine-cosine ``pos_embed``
            parameter (only when ``use_abs_pos_emb`` is False).
        final_norm (bool): use a LayerNorm on the outputs; only takes effect
            together with ``with_fpn``.
        pretrained (str, optional): URL or path of weights to load.
        window_size (int, optional): window size of the non-global blocks.
        out_indices (tuple[int]): block indices whose outputs are returned
            (at most four).
        with_fpn (bool): post-process the outputs with ``fpn1`` .. ``fpn4``.
        use_checkpoint (bool): recompute blocks during training; also calls
            ``paddle.seed(0)`` at construction.
    """

    def __init__(self,
                 img_size=(1024, 1024),
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=False,
                 attn_bias=False,
                 qk_scale=None,
                 init_values=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.,
                 act_layer='nn.GELU',
                 norm_layer='nn.LayerNorm',
                 lr_decay_rate=1.0,
                 global_attn_indexes=(2, 5, 8, 11),
                 use_abs_pos=False,
                 use_rel_pos=False,
                 use_abs_pos_emb=False,
                 use_sincos_pos_emb=False,
                 rel_pos_zero_init=True,
                 epsilon=1e-5,
                 final_norm=False,
                 pretrained=None,
                 window_size=None,
                 out_indices=(11, ),
                 with_fpn=False,
                 use_checkpoint=False,
                 *args,
                 **kwargs):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.depth = depth
        self.global_attn_indexes = global_attn_indexes
        self.epsilon = epsilon
        self.with_fpn = with_fpn
        self.use_checkpoint = use_checkpoint

        self.patch_h = img_size[0] // patch_size
        self.patch_w = img_size[1] // patch_size
        self.num_patches = self.patch_h * self.patch_w
        self.use_abs_pos = use_abs_pos
        self.use_abs_pos_emb = use_abs_pos_emb

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim)

        dpr = np.linspace(0, drop_path_rate, depth)
        if use_checkpoint:
            paddle.seed(0)

        if use_abs_pos_emb:
            self.pos_w = self.patch_embed.num_patches_in_w
            self.pos_h = self.patch_embed.num_patches_in_h
            # one extra leading slot in addition to the pos_h * pos_w grid
            self.pos_embed = self.create_parameter(
                shape=(1, self.pos_w * self.pos_h + 1, embed_dim),
                default_initializer=paddle.nn.initializer.TruncatedNormal(
                    std=.02))
        elif use_sincos_pos_emb:
            pos_embed = self.get_2d_sincos_position_embedding(self.patch_h,
                                                              self.patch_w)

            # store the fixed table as a parameter excluded from gradients
            self.pos_embed = pos_embed
            self.pos_embed = self.create_parameter(shape=pos_embed.shape)
            self.pos_embed.set_value(pos_embed.numpy())
            self.pos_embed.stop_gradient = True
        else:
            self.pos_embed = None

        self.blocks = nn.LayerList([
            Block(
                embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                attn_bias=attn_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[i],
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=None
                if i in self.global_attn_indexes else window_size,
                input_size=[self.patch_h, self.patch_w],
                act_layer=act_layer,
                lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate),
                norm_layer=norm_layer,
                init_values=init_values,
                epsilon=epsilon) for i in range(depth)
        ])

        assert len(out_indices) <= 4, 'out_indices out of bound'
        self.out_indices = out_indices
        self.pretrained = pretrained
        self.init_weight()

        self.out_channels = [embed_dim for _ in range(len(out_indices))]
        self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [
            patch_size for _ in range(len(out_indices))
        ]
        self.norm = Identity()
        if self.with_fpn:
            self.init_fpn(
                embed_dim=embed_dim,
                patch_size=patch_size,
                out_with_norm=final_norm)

    def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate):
        """Learning-rate factor of block ``layer_id``: ``lr_decay_rate ** (depth - layer_id)``."""
        return lr_decay_rate**(self.depth - layer_id)

    def init_weight(self):
        """Load ``self.pretrained`` (URL or local path) into this layer, if set.

        With ``use_abs_pos_emb`` enabled, a checkpoint ``pos_embed`` whose
        shape differs from the model's is resized to the
        ``(self.pos_h, self.pos_w)`` grid first.
        """
        pretrained = self.pretrained
        if not pretrained:
            return

        if 'http' in pretrained:
            path = paddle.utils.download.get_weights_path_from_url(pretrained)
        else:
            path = pretrained

        load_state_dict = paddle.load(path)
        pos_embed_name = "pos_embed"

        if pos_embed_name in load_state_dict and self.use_abs_pos_emb:
            load_pos_embed = paddle.to_tensor(
                load_state_dict[pos_embed_name], dtype="float32")
            if self.pos_embed.shape != load_pos_embed.shape:
                # The "- 1" skips one leading token; the remaining tokens
                # are assumed to form a square grid -- TODO confirm.
                pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1))
                load_state_dict[pos_embed_name] = self.resize_pos_embed(
                    load_pos_embed, (pos_size, pos_size),
                    (self.pos_h, self.pos_w))

                print("Load pos_embed and resize it from {} to {} .".format(
                    load_pos_embed.shape, self.pos_embed.shape))

        self.set_state_dict(load_state_dict)
        print("Load load_state_dict....")

    def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False):
        """Create ``fpn1`` .. ``fpn4`` for ``patch_size`` 16 or 8, and ``self.norm``.

        Any other ``patch_size`` leaves the four FPN ops undefined.
        ``self.norm`` is an ``nn.LayerNorm(embed_dim)`` when ``out_with_norm``
        is True and an ``Identity`` otherwise.
        """
        if patch_size == 16:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2),
                nn.BatchNorm2D(embed_dim),
                nn.GELU(),
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn3 = Identity()

            self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2)
        elif patch_size == 8:
            self.fpn1 = nn.Sequential(
                nn.Conv2DTranspose(
                    embed_dim, embed_dim, kernel_size=2, stride=2), )

            self.fpn2 = Identity()

            self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), )

            self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), )

        if not out_with_norm:
            self.norm = Identity()
        else:
            self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon)

    def resize_pos_embed(self, pos_embed, old_hw, new_hw):
        """
        Resize pos_embed weight.
        Args:
            pos_embed (Tensor): the pos_embed weight
            old_hw (list[int]): the height and width of old pos_embed
            new_hw (list[int]): the height and width of new pos_embed
        Returns:
            Tensor: the resized pos_embed weight
        """
        # The first entry is kept untouched; only the rest is resampled.
        cls_pos_embed = pos_embed[:, :1, :]
        pos_embed = pos_embed[:, 1:, :]

        pos_embed = pos_embed.transpose([0, 2, 1])
        pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]])
        pos_embed = F.interpolate(
            pos_embed, new_hw, mode='bicubic', align_corners=False)
        pos_embed = pos_embed.flatten(2).transpose([0, 2, 1])
        pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1)

        return pos_embed

    def get_2d_sincos_position_embedding(self, h, w, temperature=10000.):
        """Return a ``[1, h, w, embed_dim]`` sine-cosine position table.

        Channels are ``[sin(y), cos(y), sin(x), cos(x)]``, each block
        ``embed_dim // 4`` wide; ``embed_dim`` must be divisible by 4.
        """
        grid_y, grid_x = paddle.meshgrid(
            paddle.arange(
                h, dtype=paddle.float32),
            paddle.arange(
                w, dtype=paddle.float32))
        assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = self.embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = (1. / (temperature**omega)).unsqueeze(0)

        out_x = grid_x.reshape([-1, 1]).matmul(omega)
        out_y = grid_y.reshape([-1, 1]).matmul(omega)

        pos_emb = paddle.concat(
            [
                paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x),
                paddle.cos(out_x)
            ],
            axis=1)

        return pos_emb.reshape([1, h, w, self.embed_dim])

    def forward(self, inputs):
        """Compute the backbone features of ``inputs['image']``.

        Returns:
            list[Tensor]: one channel-first ``[B, C, H, W]`` map per block
            index in ``self.out_indices``; with ``with_fpn`` each map is
            additionally passed through its FPN op.
        """
        x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1])
        B, Hp, Wp, _ = paddle.shape(x)

        if self.use_abs_pos:
            x = x + self.get_2d_sincos_position_embedding(Hp, Wp)

        if self.use_abs_pos_emb:
            pos = self.resize_pos_embed(self.pos_embed,
                                        (self.pos_h, self.pos_w), (Hp, Wp))
            # resize_pos_embed returns a flat [1, 1 + Hp*Wp, C] sequence;
            # drop the leading slot and lay the rest on the token grid so it
            # can be added to the [B, Hp, Wp, C] features.
            x = x + pos[:, 1:, :].reshape([1, Hp, Wp, self.embed_dim])

        feats = []
        for idx, blk in enumerate(self.blocks):
            if self.use_checkpoint and self.training:
                x = paddle.distributed.fleet.utils.recompute(
                    blk, x, **{"preserve_rng_state": True})
            else:
                x = blk(x)
            if idx in self.out_indices:
                # normalise while channels are still the last axis (what
                # nn.LayerNorm(embed_dim) expects), then go channel-first
                feats.append(self.norm(x).transpose([0, 3, 1, 2]))

        if self.with_fpn and feats:
            # use the last len(feats) ops so the outputs match out_strides,
            # which takes the last entries of [4, 8, 16, 32]
            fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][-len(feats):]
            feats = [fpn(feat) for fpn, feat in zip(fpns, feats)]
        return feats

    @property
    def num_layers(self):
        """Number of transformer blocks."""
        return len(self.blocks)

    @property
    def no_weight_decay(self):
        """Names of the parameters reported as exempt from weight decay."""
        return {'pos_embed', 'cls_token'}

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per output, pairing ``out_channels`` with ``out_strides``."""
        return [
            ShapeSpec(
                channels=c, stride=s)
            for c, s in zip(self.out_channels, self.out_strides)
        ]
|
||||
|
||||
|
||||
class LayerNorm(nn.Layer):
    """
    A LayerNorm variant, popularized by Transformers, that performs point-wise mean and
    variance normalization over the channel dimension for inputs that have shape
    (batch_size, channels, height, width).
    Note that this modified LayerNorm is only used in ResBlock and SimpleFeaturePyramid.

    In ViT, we use the nn.LayerNorm

    Args:
        normalized_shape (int): number of channels (size of axis 1).
        eps (float): value added to the variance before the square root.
    """

    def __init__(self, normalized_shape, eps=1e-6):
        super().__init__()
        # Start as an identity affine transform (scale 1, shift 0). Without an
        # explicit initializer both parameters would receive the framework's
        # default weight initialization instead.
        self.weight = self.create_parameter(
            [normalized_shape], default_initializer=Constant(value=1.))
        self.bias = self.create_parameter(
            [normalized_shape], default_initializer=Constant(value=0.))
        self.eps = eps
        self.normalized_shape = (normalized_shape, )

    def forward(self, x):
        """Normalise ``x`` over axis 1 and apply the per-channel affine transform."""
        u = x.mean(1, keepdim=True)
        s = (x - u).pow(2).mean(1, keepdim=True)
        x = (x - u) / paddle.sqrt(s + self.eps)
        # broadcast the per-channel parameters over the two trailing axes
        x = self.weight[:, None, None] * x + self.bias[:, None, None]
        return x
|
||||
|
||||
|
||||
@register
@serializable
class SimpleFeaturePyramid(nn.Layer):
    """Builds a multi-level feature pyramid from a single backbone feature map."""

    def __init__(self,
                 in_channels,
                 out_channels,
                 spatial_scales,
                 num_levels=4,
                 use_bias=False):
        """
        Args:
            in_channels (list[int]): input channels of each level which can be
                derived from the output shape of backbone by from_config
            out_channels (int): output channel of each level.
            spatial_scales (list[float]): list of scaling factors to upsample or downsample
                the input features for creating pyramid features which can be derived from
                the output shape of backbone by from_config
            num_levels (int): number of levels of output features (4 or 5).
            use_bias (bool): whether use bias or not.

        Raises:
            NotImplementedError: if ``num_levels`` is neither 4 nor 5.
        """
        super(SimpleFeaturePyramid, self).__init__()

        self.in_channels = in_channels[0]
        self.out_channels = out_channels
        self.num_levels = num_levels

        self.stages = []
        # One stage per scale factor; the top block adds the last level.
        if num_levels == 4:
            scale_factors = [2.0, 1.0, 0.5]
        elif num_levels == 5:
            scale_factors = [4.0, 2.0, 1.0, 0.5]
        else:
            raise NotImplementedError(
                f"num_levels={num_levels} is not supported yet.")

        dim = self.in_channels
        for scale in scale_factors:
            out_dim = dim
            if scale == 4.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2),
                    # Feature maps are channel-first here, so the
                    # channel-wise LayerNorm of this file is required;
                    # nn.LayerNorm would normalise the trailing axis.
                    LayerNorm(dim // 2),
                    nn.GELU(),
                    nn.Conv2DTranspose(
                        dim // 2, dim // 4, kernel_size=2, stride=2),
                ]
                out_dim = dim // 4
            elif scale == 2.0:
                layers = [
                    nn.Conv2DTranspose(
                        dim, dim // 2, kernel_size=2, stride=2)
                ]
                out_dim = dim // 2
            elif scale == 1.0:
                layers = []
            elif scale == 0.5:
                layers = [nn.MaxPool2D(kernel_size=2, stride=2)]

            # 1x1 projection to out_channels followed by a 3x3 convolution
            layers.extend([
                nn.Conv2D(
                    out_dim,
                    out_channels,
                    kernel_size=1,
                    bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D(
                        out_channels,
                        out_channels,
                        kernel_size=3,
                        padding=1,
                        bias_attr=use_bias, ), LayerNorm(out_channels)
            ])
            layers = nn.Sequential(*layers)

            # name the stage by the log2 of its resulting stride
            stage = -int(math.log2(spatial_scales[0] * scale))
            self.add_sublayer(f"simfp_{stage}", layers)
            self.stages.append(layers)

        # top block output feature maps.
        self.top_block = nn.Sequential(
            nn.MaxPool2D(
                kernel_size=1, stride=2, padding=0))

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``spatial_scales`` from the backbone output shapes."""
        return {
            'in_channels': [i.channels for i in input_shape],
            'spatial_scales': [1.0 / i.stride for i in input_shape],
        }

    @property
    def out_shape(self):
        """One ``ShapeSpec`` with ``out_channels`` channels per output level."""
        return [
            ShapeSpec(channels=self.out_channels)
            for _ in range(self.num_levels)
        ]

    def forward(self, feats):
        """
        Args:
            feats (list[Tensor]): backbone outputs; only ``feats[0]`` is used.
        Returns:
            list[Tensor]: ``num_levels`` maps -- one per stage plus the top
            block applied to the last stage output.
        """
        features = feats[0]
        results = [stage(features) for stage in self.stages]

        top_block_in_feature = results[-1]
        results.append(self.top_block(top_block_in_feature))
        assert self.num_levels == len(results)

        return results
|
||||
607
rtdetr_paddle/ppdet/modeling/bbox_utils.py
Normal file
607
rtdetr_paddle/ppdet/modeling/bbox_utils.py
Normal file
@@ -0,0 +1,607 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import numpy as np
|
||||
|
||||
|
||||
def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]):
    """Encode target boxes as weighted deltas relative to source boxes.

    Args:
        src_boxes (Tensor): 2-D tensor whose columns 0..3 are read as
            x1, y1, x2, y2.
        tgt_boxes (Tensor): boxes to encode, indexed the same way.
        weights (list): four multipliers (wx, wy, ww, wh) applied to the
            dx, dy, dw, dh deltas respectively.

    Returns:
        Tensor: deltas (dx, dy, dw, dh) stacked along axis 1.
    """
    wx, wy, ww, wh = weights

    # Source boxes: size and centre.
    anchor_w = src_boxes[:, 2] - src_boxes[:, 0]
    anchor_h = src_boxes[:, 3] - src_boxes[:, 1]
    anchor_cx = src_boxes[:, 0] + 0.5 * anchor_w
    anchor_cy = src_boxes[:, 1] + 0.5 * anchor_h

    # Target boxes: size and centre.
    gt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
    gt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
    gt_cx = tgt_boxes[:, 0] + 0.5 * gt_w
    gt_cy = tgt_boxes[:, 1] + 0.5 * gt_h

    # Centre offsets are normalised by the source size; sizes use log-ratios.
    encoded = [
        wx * (gt_cx - anchor_cx) / anchor_w,
        wy * (gt_cy - anchor_cy) / anchor_h,
        ww * paddle.log(gt_w / anchor_w),
        wh * paddle.log(gt_h / anchor_h),
    ]
    return paddle.stack(encoded, axis=1)
|
||||
|
||||
|
||||
def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None):
    """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead.

    Args:
        deltas (Tensor): 2-D tensor; along axis 1 every group of four
            values is read as (dx, dy, dw, dh).
        boxes (Tensor): reference boxes, columns 0..3 read as x1, y1, x2, y2.
        weights (list): four divisors (wx, wy, ww, wh) applied to the deltas.
        max_shape (sequence|None): when given, x coordinates are clipped to
            [0, max_shape[1]] and y coordinates to [0, max_shape[0]].

    Returns:
        Tensor: decoded boxes with the four coordinates stacked on a new last
        axis, i.e. [n, k, 4] where k is the number of delta groups ([n, 1, 4]
        for a single group). If a reshape is needed, add it after the call
        rather than here.
    """
    # Cap on dw/dh so that paddle.exp() never receives too large a value.
    max_log_ratio = math.log(1000.0 / 16)

    ref_w = boxes[:, 2] - boxes[:, 0]
    ref_h = boxes[:, 3] - boxes[:, 1]
    ref_cx = boxes[:, 0] + 0.5 * ref_w
    ref_cy = boxes[:, 1] + 0.5 * ref_h

    wx, wy, ww, wh = weights
    dx = deltas[:, 0::4] / wx
    dy = deltas[:, 1::4] / wy
    dw = paddle.clip(deltas[:, 2::4] / ww, max=max_log_ratio)
    dh = paddle.clip(deltas[:, 3::4] / wh, max=max_log_ratio)

    # Column views so the per-box values broadcast over the delta groups.
    ref_w_col = ref_w.unsqueeze(1)
    ref_h_col = ref_h.unsqueeze(1)

    cx = dx * ref_w_col + ref_cx.unsqueeze(1)
    cy = dy * ref_h_col + ref_cy.unsqueeze(1)
    half_w = 0.5 * (paddle.exp(dw) * ref_w_col)
    half_h = 0.5 * (paddle.exp(dh) * ref_h_col)

    pred_boxes = paddle.stack(
        [cx - half_w, cy - half_h, cx + half_w, cy + half_h], axis=-1)

    if max_shape is not None:
        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
            min=0, max=max_shape[1])
        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
            min=0, max=max_shape[0])
    return pred_boxes
|
||||
|
||||
|
||||
def bbox2delta_v2(src_boxes,
                  tgt_boxes,
                  delta_mean=[0.0, 0.0, 0.0, 0.0],
                  delta_std=[1.0, 1.0, 1.0, 1.0]):
    """Encode target boxes as mean/std-normalised deltas.

    Variant of bbox2delta() that standardises the deltas with
    ``delta_mean`` / ``delta_std`` instead of multiplying them by weights.

    Args:
        src_boxes (Tensor): 2-D tensor, columns 0..3 read as x1, y1, x2, y2.
        tgt_boxes (Tensor): boxes to encode, indexed the same way.
        delta_mean (list): values subtracted from (dx, dy, dw, dh).
        delta_std (list): values the centred deltas are divided by.

    Returns:
        Tensor: normalised deltas stacked along axis 1.
    """
    anchor_w = src_boxes[:, 2] - src_boxes[:, 0]
    anchor_h = src_boxes[:, 3] - src_boxes[:, 1]
    anchor_cx = src_boxes[:, 0] + 0.5 * anchor_w
    anchor_cy = src_boxes[:, 1] + 0.5 * anchor_h

    gt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0]
    gt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1]
    gt_cx = tgt_boxes[:, 0] + 0.5 * gt_w
    gt_cy = tgt_boxes[:, 1] + 0.5 * gt_h

    raw = paddle.stack(
        [
            (gt_cx - anchor_cx) / anchor_w,
            (gt_cy - anchor_cy) / anchor_h,
            paddle.log(gt_w / anchor_w),
            paddle.log(gt_h / anchor_h),
        ],
        axis=1)

    centred = raw - paddle.to_tensor(delta_mean)
    return centred / paddle.to_tensor(delta_std)
|
||||
|
||||
|
||||
def delta2bbox_v2(deltas,
                  boxes,
                  delta_mean=[0.0, 0.0, 0.0, 0.0],
                  delta_std=[1.0, 1.0, 1.0, 1.0],
                  max_shape=None,
                  ctr_clip=32.0):
    """Decode mean/std-normalised deltas to boxes. Used in YOLOFHead.

    Variant of delta2bbox() that de-normalises the deltas with
    ``delta_std`` / ``delta_mean`` instead of dividing them by weights.

    Args:
        deltas (Tensor): 2-D tensor; along axis 1 every group of four
            values is read as (dx, dy, dw, dh).
        boxes (Tensor): reference boxes, columns 0..3 read as x1, y1, x2, y2.
        delta_mean (list): values added after scaling by ``delta_std``.
        delta_std (list): values the deltas are multiplied by.
        max_shape (sequence|None): when given, x coordinates are clipped to
            [0, max_shape[1]] and y coordinates to [0, max_shape[0]].
        ctr_clip (float|None): when not None, the centre shifts (already
            scaled by the box size) are clipped to [-ctr_clip, ctr_clip] and
            dw/dh are only clipped from above; when None, dw/dh are clipped
            on both sides and the centre shifts are left untouched.

    Returns:
        Tensor: decoded boxes with the four coordinates stacked on a new last
        axis ([n, 1, 4] for a single delta group). If a reshape is needed,
        add it after the call rather than here.
    """
    # Cap on dw/dh so that paddle.exp() never receives too large a value.
    max_log_ratio = math.log(1000.0 / 16)

    ref_w = boxes[:, 2] - boxes[:, 0]
    ref_h = boxes[:, 3] - boxes[:, 1]
    ref_cx = boxes[:, 0] + 0.5 * ref_w
    ref_cy = boxes[:, 1] + 0.5 * ref_h

    # Column views so the per-box values broadcast over the delta groups.
    ref_w_col = ref_w.unsqueeze(1)
    ref_h_col = ref_h.unsqueeze(1)

    deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean)
    shift_x = deltas[:, 0::4] * ref_w_col
    shift_y = deltas[:, 1::4] * ref_h_col
    dw = deltas[:, 2::4]
    dh = deltas[:, 3::4]

    if ctr_clip is None:
        dw = dw.clip(min=-max_log_ratio, max=max_log_ratio)
        dh = dh.clip(min=-max_log_ratio, max=max_log_ratio)
    else:
        shift_x = paddle.clip(shift_x, max=ctr_clip, min=-ctr_clip)
        shift_y = paddle.clip(shift_y, max=ctr_clip, min=-ctr_clip)
        dw = paddle.clip(dw, max=max_log_ratio)
        dh = paddle.clip(dh, max=max_log_ratio)

    cx = shift_x + ref_cx.unsqueeze(1)
    cy = shift_y + ref_cy.unsqueeze(1)
    half_w = 0.5 * (paddle.exp(dw) * ref_w_col)
    half_h = 0.5 * (paddle.exp(dh) * ref_h_col)

    pred_boxes = paddle.stack(
        [cx - half_w, cy - half_h, cx + half_w, cy + half_h], axis=-1)

    if max_shape is not None:
        pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip(
            min=0, max=max_shape[1])
        pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip(
            min=0, max=max_shape[0])
    return pred_boxes
|
||||
|
||||
|
||||
def expand_bbox(bboxes, scale):
    """Scale boxes about their centres.

    Args:
        bboxes (ndarray): 2-D array, columns 0..3 read as x1, y1, x2, y2.
        scale: factor applied to each box's half-width and half-height.

    Returns:
        ndarray: float32 array of ``bboxes.shape`` holding the scaled boxes;
        the input array is not modified.
    """
    centre_x = (bboxes[:, 2] + bboxes[:, 0]) * .5
    centre_y = (bboxes[:, 3] + bboxes[:, 1]) * .5

    half_w = (bboxes[:, 2] - bboxes[:, 0]) * .5
    half_h = (bboxes[:, 3] - bboxes[:, 1]) * .5
    # In-place scaling of the freshly created half extents.
    half_w *= scale
    half_h *= scale

    expanded = np.zeros(bboxes.shape, dtype=np.float32)
    expanded[:, 0] = centre_x - half_w
    expanded[:, 1] = centre_y - half_h
    expanded[:, 2] = centre_x + half_w
    expanded[:, 3] = centre_y + half_h
    return expanded
|
||||
|
||||
|
||||
def clip_bbox(boxes, im_shape):
    """Clip box coordinates to the image extent.

    Args:
        boxes (Tensor): 2-D tensor, columns 0..3 read as x1, y1, x2, y2.
        im_shape (sequence): ``im_shape[0]`` is used as the height bound and
            ``im_shape[1]`` as the width bound.

    Returns:
        Tensor: the four clipped columns stacked along axis 1.
    """
    height, width = im_shape[0], im_shape[1]
    # x columns are bounded by the width, y columns by the height.
    upper_bounds = (width, height, width, height)
    clipped = [
        boxes[:, col].clip(0, bound) for col, bound in enumerate(upper_bounds)
    ]
    return paddle.stack(clipped, axis=1)
|
||||
|
||||
|
||||
def nonempty_bbox(boxes, min_size=0, return_mask=False):
    """Find boxes whose width and height both exceed ``min_size``.

    Args:
        boxes (Tensor): 2-D tensor, columns 0..3 read as x1, y1, x2, y2.
        min_size: strict lower bound for both width and height.
        return_mask (bool): return the boolean mask instead of indices.

    Returns:
        Tensor: the boolean mask when ``return_mask`` is truthy, otherwise
        the flattened indices of the boxes that pass the test.
    """
    box_w = boxes[:, 2] - boxes[:, 0]
    box_h = boxes[:, 3] - boxes[:, 1]
    valid = paddle.logical_and(box_h > min_size, box_w > min_size)
    return valid if return_mask else paddle.nonzero(valid).flatten()
|
||||
|
||||
|
||||
def bbox_area(boxes):
    """Return width * height for each box.

    Args:
        boxes: 2-D array-like supporting ``[:, k]`` indexing, columns 0..3
            read as x1, y1, x2, y2.

    Returns:
        Per-box areas; no clamping is applied, so inverted boxes can yield
        negative values.
    """
    box_w = boxes[:, 2] - boxes[:, 0]
    box_h = boxes[:, 3] - boxes[:, 1]
    return box_w * box_h
|
||||
|
||||
|
||||
def bbox_overlaps(boxes1, boxes2):
    """
    Calculate pairwise IoU between boxes1 and boxes2

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape
            [M, N]; an all-zero float32 tensor when either input is empty.
    """
    num1 = boxes1.shape[0]
    num2 = boxes2.shape[0]
    if num1 * num2 == 0:
        return paddle.zeros([num1, num2], dtype='float32')

    area1 = bbox_area(boxes1)
    area2 = bbox_area(boxes2)

    # [M, 1, 4] view so that boxes1 broadcasts against every box in boxes2.
    boxes1_col = paddle.unsqueeze(boxes1, 1)
    bottom_right = paddle.minimum(boxes1_col[:, :, 2:], boxes2[:, 2:])
    top_left = paddle.maximum(boxes1_col[:, :, :2], boxes2[:, :2])

    inter = (bottom_right - top_left).clip(min=0).prod(axis=2)
    union = paddle.unsqueeze(area1, 1) + area2 - inter

    # Pairs without intersection are forced to exactly zero.
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
|
||||
|
||||
|
||||
def batch_bbox_overlaps(bboxes1,
                        bboxes2,
                        mode='iou',
                        is_aligned=False,
                        eps=1e-6):
    """Calculate overlap between two set of bboxes.

    If ``is_aligned`` is ``False``, then calculate the overlaps between each
    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
    pair of bboxes1 and bboxes2.

    NOTE(review): the coordinate indexing below (``bboxes[:, k]``) treats the
    inputs as 2-D ``(m, 4)`` / ``(n, 4)`` tensors; inputs with leading batch
    dims do not look supported by this implementation -- confirm with callers.

    Args:
        bboxes1 (Tensor): shape (m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (n, 4) in <x1, y1, x2, y2> format or empty.
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground, i.e. over the area of bboxes1) or "giou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): Lower bound applied to the denominators for
            numerical stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,).
        For "iou"/"iof" the overlap ratio is returned; for "giou" the value
        returned is ``1 - giou``. When either input is empty a tensor filled
        with 1 is returned.
    """
    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
    # Either the boxes are empty or the length of the boxes' last dimension is 4
    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
    if is_aligned:
        assert rows == cols

    if rows * cols == 0:
        # Tensor.shape is a Python list, so the former ``batch_shape + (rows, )``
        # raised TypeError (list + tuple). Build the shape as a list instead.
        tail = [rows] if is_aligned else [rows, cols]
        return paddle.full(list(batch_shape) + tail, 1)

    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])

    if is_aligned:
        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [rows, 2]
        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [rows, 2]

        wh = (rb - lt).clip(min=0)  # [rows, 2]
        overlap = wh[:, 0] * wh[:, 1]

        if mode in ['iou', 'giou']:
            union = area1 + area2 - overlap
        else:
            union = area1
        if mode == 'giou':
            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
    else:
        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
                            bboxes2[:, :2])  # [rows, cols, 2]
        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
                            bboxes2[:, 2:])  # [rows, cols, 2]

        wh = (rb - lt).clip(min=0)  # [rows, cols, 2]
        overlap = wh[:, :, 0] * wh[:, :, 1]

        if mode in ['iou', 'giou']:
            union = area1.reshape([rows, 1]) \
                + area2.reshape([1, cols]) - overlap
        else:
            union = area1[:, None]
        if mode == 'giou':
            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
                                         bboxes2[:, :2])
            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
                                         bboxes2[:, 2:])

    eps = paddle.to_tensor([eps])
    union = paddle.maximum(union, eps)
    ious = overlap / union
    if mode in ['iou', 'iof']:
        return ious
    # calculate gious
    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
    # Index the last axis with an ellipsis: enclose_wh is [rows, 2] in aligned
    # mode and [rows, cols, 2] otherwise, so ``[:, :, 0]`` broke the aligned case.
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = paddle.maximum(enclose_area, eps)
    gious = ious - (enclose_area - union) / enclose_area
    return 1 - gious
|
||||
|
||||
|
||||
def xywh2xyxy(box):
    """Convert a centre-format box to corner format.

    Args:
        box: iterable of four values (x, y, w, h), where (x, y) is treated
            as the box centre.

    Returns:
        list: [x1, y1, x2, y2].
    """
    cx, cy, width, height = box
    half_w = width * 0.5
    half_h = height * 0.5
    return [cx - half_w, cy - half_h, cx + half_w, cy + half_h]
|
||||
|
||||
|
||||
def make_grid(h, w, dtype):
    """Build a grid of (x, y) cell indices.

    Args:
        h: number of rows (range passed to the first meshgrid axis).
        w: number of columns (range passed to the second meshgrid axis).
        dtype: dtype the resulting grid is cast to.

    Returns:
        Tensor: the x and y index maps stacked on axis 2, x first.
    """
    row_idx = paddle.arange(h)
    col_idx = paddle.arange(w)
    yv, xv = paddle.meshgrid([row_idx, col_idx])
    grid = paddle.stack((xv, yv), 2)
    return grid.cast(dtype=dtype)
|
||||
|
||||
|
||||
def decode_yolo(box, anchor, downsample_ratio):
    """decode yolo box

    Args:
        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
        anchor (list): anchor with the shape [na, 2]
        downsample_ratio (int): downsample ratio of the feature map

    Return:
        box (list): decoded box, [x, y, w, h], all have the shape
            [b, na, h, w, 1]; x and w are divided by the grid width
            (w additionally by downsample_ratio), y and h likewise by the
            grid height.
    """
    x, y, w, h = box
    na, grid_h, grid_w = x.shape[1:4]

    # Cell indices, broadcastable over batch and anchor dims.
    grid = make_grid(grid_h, grid_w, x.dtype).reshape(
        (1, 1, grid_h, grid_w, 2))
    grid_x = grid[:, :, :, :, 0:1]
    grid_y = grid[:, :, :, :, 1:2]
    x1 = (x + grid_x) / grid_w
    y1 = (y + grid_y) / grid_h

    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
    anchor = anchor.reshape((1, na, 1, 1, 2))
    anchor_w = anchor[:, :, :, :, 0:1]
    anchor_h = anchor[:, :, :, :, 1:2]
    w1 = paddle.exp(w) * anchor_w / (downsample_ratio * grid_w)
    h1 = paddle.exp(h) * anchor_h / (downsample_ratio * grid_h)

    return [x1, y1, w1, h1]
|
||||
|
||||
|
||||
def batch_iou_similarity(box1, box2, eps=1e-9):
    """Calculate iou of box1 and box2 in batch

    Args:
        box1 (Tensor): box with the shape [N, M1, 4]
        box2 (Tensor): box with the shape [N, M2, 4]
        eps (float): value added to the union to avoid division by zero

    Return:
        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
    """
    lhs = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
    rhs = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]

    lhs_min, lhs_max = lhs[:, :, :, 0:2], lhs[:, :, :, 2:4]
    rhs_min, rhs_max = rhs[:, :, :, 0:2], rhs[:, :, :, 2:4]

    inter_min = paddle.maximum(lhs_min, rhs_min)
    inter_max = paddle.minimum(lhs_max, rhs_max)
    inter = (inter_max - inter_min).clip(0).prod(-1)

    lhs_area = (lhs_max - lhs_min).clip(0).prod(-1)
    rhs_area = (rhs_max - rhs_min).clip(0).prod(-1)
    return inter / (lhs_area + rhs_area - inter + eps)
|
||||
|
||||
|
||||
def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
    """calculate the iou of box1 and box2

    Args:
        box1 (list): four tensors unpacked as [x1, y1, x2, y2] (corner
            format); presumably each of shape [b, na, h, w, 1]
        box2 (list): four tensors unpacked as [x1, y1, x2, y2], same layout
        giou (bool): whether use giou or not, default False
        diou (bool): whether use diou or not, default False
        ciou (bool): whether use ciou or not, default False
        eps (float): epsilon to avoid divide by zero

    Return:
        iou (Tensor): iou of box1 and box2 (or its giou/diou/ciou variant),
            elementwise over the inputs. When several flags are set, giou
            takes precedence over diou, which takes precedence over ciou.
    """
    px1, py1, px2, py2 = box1
    gx1, gy1, gx2, gy2 = box2

    inter_w = (paddle.minimum(px2, gx2) - paddle.maximum(px1, gx1)).clip(0)
    inter_h = (paddle.minimum(py2, gy2) - paddle.maximum(py1, gy1)).clip(0)
    overlap = inter_w * inter_h

    area1 = ((px2 - px1) * (py2 - py1)).clip(0)
    area2 = ((gx2 - gx1) * (gy2 - gy1)).clip(0)

    union = area1 + area2 - overlap + eps
    iou = overlap / union

    if not (giou or ciou or diou):
        return iou

    # width / height of the smallest box enclosing both inputs
    cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
    ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
    if giou:
        c_area = cw * ch + eps
        return iou - (c_area - union) / c_area

    # squared diagonal of the enclosing box
    c2 = cw**2 + ch**2 + eps
    # squared distance between the two box centres
    rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
    if diou:
        return iou - rho2 / c2

    # ciou: add the aspect-ratio consistency term
    w1, h1 = px2 - px1, py2 - py1 + eps
    w2, h2 = gx2 - gx1, gy2 - gy1 + eps
    delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
    v = (4 / math.pi**2) * paddle.pow(delta, 2)
    alpha = v / (1 + eps - iou + v)
    # alpha acts as a constant weight: no gradient flows through it
    alpha.stop_gradient = True
    return iou - (rho2 / c2 + v * alpha)
|
||||
|
||||
|
||||
def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
    """
    Calculate the pairwise iou of box1 and box2 with numpy.

    Args:
        box1 (ndarray): [N, 4]
        box2 (ndarray): [M, 4], usually N != M
        x1y1x2y2 (bool): whether boxes are in x1y1x2y2 style (default True);
            otherwise they are read as cx, cy, w, h
        eps (float): epsilon to avoid divide by zero
    Return:
        iou (ndarray): iou of box1 and box2, [N, M]
    """
    if x1y1x2y2:
        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
    else:
        # cxcywh style
        # Transform from center and width to exact coordinates
        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    # Intersection rectangle for every (box1, box2) pair via broadcasting of
    # an [N, 1] column against a [1, M] row. The float32 cast mirrors the
    # float32 scratch arrays the loop-based implementation wrote into.
    inter_x1 = np.maximum(b1_x1[:, None], b2_x1[None, :]).astype(np.float32)
    inter_y1 = np.maximum(b1_y1[:, None], b2_y1[None, :]).astype(np.float32)
    inter_x2 = np.minimum(b1_x2[:, None], b2_x2[None, :]).astype(np.float32)
    inter_y2 = np.minimum(b1_y2[:, None], b2_y2[None, :]).astype(np.float32)

    # Intersection area (zero when the rectangles do not overlap)
    inter_area = np.maximum(inter_x2 - inter_x1, 0) * np.maximum(
        inter_y2 - inter_y1, 0)

    # Union area; the per-box areas broadcast to [N, M]
    b1_area = ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1)
    b2_area = ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1)

    ious = inter_area / (b1_area + b2_area - inter_area + eps)
    return ious
|
||||
|
||||
|
||||
def bbox2distance(points, bbox, max_dis=None, eps=0.1):
    """Encode bounding boxes as distances from points to the four sides.

    Args:
        points (Tensor): Shape (n, 2), [x, y].
        bbox (Tensor): Shape (n, 4), "xyxy" format
        max_dis (float): Upper bound of the distance; when given, distances
            are clipped to [0, max_dis - eps].
        eps (float): a small value to ensure target < max_dis, instead <=
    Returns:
        Tensor: distances (left, top, right, bottom) stacked on the last axis.
    """
    px, py = points[:, 0], points[:, 1]
    distances = [
        px - bbox[:, 0],  # left
        py - bbox[:, 1],  # top
        bbox[:, 2] - px,  # right
        bbox[:, 3] - py,  # bottom
    ]
    if max_dis is not None:
        distances = [d.clip(min=0, max=max_dis - eps) for d in distances]
    return paddle.stack(distances, -1)
|
||||
|
||||
|
||||
def distance2bbox(points, distance, max_shape=None):
    """Decode distance prediction to bounding box.

    Args:
        points (Tensor): Shape (n, 2), [x, y].
        distance (Tensor): Distance from the given point to 4
            boundaries (left, top, right, bottom).
        max_shape (tuple): Shape of the image; when given, x values are
            clipped to [0, max_shape[1]] and y values to [0, max_shape[0]].
    Returns:
        Tensor: Decoded bboxes (x1, y1, x2, y2) stacked on the last axis.
    """
    px, py = points[:, 0], points[:, 1]
    corners = [
        px - distance[:, 0],
        py - distance[:, 1],
        px + distance[:, 2],
        py + distance[:, 3],
    ]
    if max_shape is not None:
        # Even entries are x coordinates (width bound), odd ones are y.
        bounds = (max_shape[1], max_shape[0], max_shape[1], max_shape[0])
        corners = [
            coord.clip(min=0, max=bound)
            for coord, bound in zip(corners, bounds)
        ]
    return paddle.stack(corners, -1)
|
||||
|
||||
|
||||
def bbox_center(boxes):
    """Get bbox centers from boxes.

    Args:
        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
    Returns:
        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
    """
    x_min, y_min = boxes[..., 0], boxes[..., 1]
    x_max, y_max = boxes[..., 2], boxes[..., 3]
    centre_x = (x_min + x_max) / 2
    centre_y = (y_min + y_max) / 2
    return paddle.stack([centre_x, centre_y], axis=-1)
|
||||
|
||||
|
||||
def batch_distance2bbox(points, distance, max_shapes=None):
    """Decode distance prediction to bounding box for batch.

    Args:
        points (Tensor): [B, ..., 2], "xy" format
        distance (Tensor): [B, ..., 4], "ltrb" format
        max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image.
    Returns:
        Tensor: Decoded bboxes, "x1y1x2y2" format; limited to
        [0, image size] per coordinate when ``max_shapes`` is given.
    """
    left_top, right_bottom = paddle.split(distance, 2, -1)
    # while tensor add parameters, parameters should be better placed on the second place
    top_left_xy = -left_top + points
    bottom_right_xy = right_bottom + points
    out_bbox = paddle.concat([top_left_xy, bottom_right_xy], -1)

    if max_shapes is None:
        return out_bbox

    # (h, w) -> (w, h, w, h) so the bounds line up with x1, y1, x2, y2.
    max_shapes = max_shapes.flip(-1).tile([1, 2])
    # Insert singleton axes until the bounds have as many dims as the boxes.
    missing_dims = out_bbox.ndim - max_shapes.ndim
    for _ in range(missing_dims):
        max_shapes.unsqueeze_(1)

    out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes)
    out_bbox = paddle.where(out_bbox > 0, out_bbox,
                            paddle.zeros_like(out_bbox))
    return out_bbox
|
||||
|
||||
|
||||
def iou_similarity(box1, box2, eps=1e-10):
    """Calculate iou of box1 and box2

    Args:
        box1 (Tensor): box with the shape [M1, 4]
        box2 (Tensor): box with the shape [M2, 4]
        eps (float): value added to the union to avoid division by zero

    Return:
        iou (Tensor): iou between box1 and box2 with the shape [M1, M2]
    """
    lhs = box1.unsqueeze(1)  # [M1, 4] -> [M1, 1, 4]
    rhs = box2.unsqueeze(0)  # [M2, 4] -> [1, M2, 4]

    lhs_min, lhs_max = lhs[:, :, 0:2], lhs[:, :, 2:4]
    rhs_min, rhs_max = rhs[:, :, 0:2], rhs[:, :, 2:4]

    inter_min = paddle.maximum(lhs_min, rhs_min)
    inter_max = paddle.minimum(lhs_max, rhs_max)
    inter = (inter_max - inter_min).clip(0).prod(-1)

    lhs_area = (lhs_max - lhs_min).clip(0).prod(-1)
    rhs_area = (rhs_max - rhs_min).clip(0).prod(-1)
    return inter / (lhs_area + rhs_area - inter + eps)
|
||||
40
rtdetr_paddle/ppdet/modeling/cls_utils.py
Normal file
40
rtdetr_paddle/ppdet/modeling/cls_utils.py
Normal file
@@ -0,0 +1,40 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
def _get_class_default_kwargs(cls, *args, **kwargs):
|
||||
"""
|
||||
Get default arguments of a class in dict format, if args and
|
||||
kwargs is specified, it will replace default arguments
|
||||
"""
|
||||
varnames = cls.__init__.__code__.co_varnames
|
||||
argcount = cls.__init__.__code__.co_argcount
|
||||
keys = varnames[:argcount]
|
||||
assert keys[0] == 'self'
|
||||
keys = keys[1:]
|
||||
|
||||
values = list(cls.__init__.__defaults__)
|
||||
assert len(values) == len(keys)
|
||||
|
||||
if len(args) > 0:
|
||||
for i, arg in enumerate(args):
|
||||
values[i] = arg
|
||||
|
||||
default_kwargs = dict(zip(keys, values))
|
||||
|
||||
if len(kwargs) > 0:
|
||||
for k, v in kwargs.items():
|
||||
default_kwargs[k] = v
|
||||
|
||||
return default_kwargs
|
||||
16
rtdetr_paddle/ppdet/modeling/heads/__init__.py
Normal file
16
rtdetr_paddle/ppdet/modeling/heads/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .detr_head import *
|
||||
|
||||
534
rtdetr_paddle/ppdet/modeling/heads/detr_head.py
Normal file
534
rtdetr_paddle/ppdet/modeling/heads/detr_head.py
Normal file
@@ -0,0 +1,534 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register
|
||||
from ..initializer import linear_init_, constant_
|
||||
from ..transformers.utils import inverse_sigmoid
|
||||
|
||||
import pycocotools.mask as mask_util
|
||||
|
||||
__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead']
|
||||
|
||||
|
||||
class MLP(nn.Layer):
    """Stack of linear layers with ReLU between them (none after the last).

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers

        # Layer i maps in_dims[i] -> out_dims[i]; all inner widths are hidden_dim.
        hidden = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden
        out_dims = hidden + [output_dim]
        self.layers = nn.LayerList(
            [nn.Linear(d_in, d_out) for d_in, d_out in zip(in_dims, out_dims)])

        self._reset_parameters()

    def _reset_parameters(self):
        # Apply the shared linear initializer to every layer.
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        last_idx = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last_idx:
                x = F.relu(x)
        return x
|
||||
|
||||
|
||||
class MultiHeadAttentionMap(nn.Layer):
    """2D attention module that returns only the attention softmax.

    No multiplication by a value tensor is performed.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
    """

    def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0,
                 bias=True):
        super().__init__()
        self.num_heads = num_heads
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(dropout)

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        if bias:
            bias_attr = paddle.framework.ParamAttr(
                initializer=paddle.nn.initializer.Constant())
        else:
            bias_attr = False

        # Queries are projected by a linear layer, keys by a 1x1 convolution.
        self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr)
        self.k_proj = nn.Conv2D(
            query_dim,
            hidden_dim,
            1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # Scale applied to the queries: (per-head channels) ** -0.5.
        self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5

    def forward(self, q, k, mask=None):
        q = self.q_proj(q)
        k = self.k_proj(k)

        bs, num_queries = q.shape[0], q.shape[1]
        n = self.num_heads
        c = self.hidden_dim // self.num_heads
        h, w = k.shape[-2], k.shape[-1]

        # Batched matmul equivalent of
        # paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh)
        qh = q.reshape([bs, num_queries, n, c])
        qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c])
        kh = k.reshape([bs, n, c, h, w]).reshape([-1, c, h * w])

        scores = paddle.bmm(qh * self.normalize_fact, kh)
        weights = scores.reshape([bs, n, num_queries, h, w]).transpose(
            [0, 2, 1, 3, 4])

        if mask is not None:
            weights += mask
        # fix a potential bug: https://github.com/facebookresearch/detr/issues/247
        # Softmax runs jointly over the flattened (heads, h, w) axes.
        full_shape = weights.shape
        weights = F.softmax(weights.flatten(3), axis=-1).reshape(full_shape)
        return self.dropout(weights)
|
||||
|
||||
|
||||
class MaskHeadFPNConv(nn.Layer):
    """Simple convolutional mask head using group norm.

    Upsampling is done using a FPN approach.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/segmentation.py
    """

    def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8):
        super().__init__()

        # Channel schedule: input_dim, then context_dim halved 1..4 times.
        inter_dims = [input_dim]
        inter_dims += [context_dim // (2**i) for i in range(1, 5)]

        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.framework.ParamAttr(
            initializer=paddle.nn.initializer.Constant())

        self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups,
                                       weight_attr, bias_attr)

        # One conv/norm/relu stage per consecutive pair of inter_dims.
        self.conv_inter = nn.LayerList([
            self._make_layers(dim_in, dim_out, 3, num_groups, weight_attr,
                              bias_attr)
            for dim_in, dim_out in zip(inter_dims[:-1], inter_dims[1:])
        ])

        self.conv_out = nn.Conv2D(
            inter_dims[-1],
            1,
            3,
            padding=1,
            weight_attr=weight_attr,
            bias_attr=bias_attr)

        # 1x1 adapters projecting each FPN level to the matching stage width.
        self.adapter = nn.LayerList()
        for idx, fpn_dim in enumerate(fpn_dims):
            self.adapter.append(
                nn.Conv2D(
                    fpn_dim,
                    inter_dims[idx + 1],
                    1,
                    weight_attr=weight_attr,
                    bias_attr=bias_attr))

    def _make_layers(self,
                     in_dims,
                     out_dims,
                     kernel_size,
                     num_groups,
                     weight_attr=None,
                     bias_attr=None):
        # Conv -> GroupNorm -> ReLU; padding of kernel_size // 2 on the conv.
        conv = nn.Conv2D(
            in_dims,
            out_dims,
            kernel_size,
            padding=kernel_size // 2,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        norm = nn.GroupNorm(num_groups, out_dims)
        return nn.Sequential(conv, norm, nn.ReLU())

    def forward(self, x, bbox_attention_map, fpns):
        # Repeat count taken from axis 1 of the attention map (presumably the
        # number of queries).
        repeats = [bbox_attention_map.shape[1], 1, 1, 1]

        x = paddle.concat(
            [x.tile(repeats), bbox_attention_map.flatten(0, 1)], 1)
        x = self.conv0(x)

        stages = zip(self.conv_inter[:-1], self.adapter, fpns)
        for inter_layer, adapter_layer, feat in stages:
            skip = adapter_layer(feat).tile(repeats)
            x = inter_layer(x)
            # Resize to the FPN level's spatial size before adding the skip.
            x = skip + F.interpolate(x, size=skip.shape[-2:])

        x = self.conv_inter[-1](x)
        return self.conv_out(x)
|
||||
|
||||
|
||||
@register
class DETRHead(nn.Layer):
    """Head mapping transformer outputs to class logits, boxes and optional masks.

    A linear layer produces the class logits and an MLP followed by a
    sigmoid produces 4-value boxes. When ``with_mask_head`` is set, an
    attention-map module and an FPN-style convolutional mask head are
    added as well.
    """
    __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss',
                 fpn_dims=[1024, 512, 256],
                 with_mask_head=False,
                 use_focal_loss=False):
        """
        Args:
            num_classes (int): Number of object classes.
            hidden_dim (int): Width of the transformer features.
            nhead (int): Number of heads of the attention-map module.
            num_mlp_layers (int): Number of layers of the box MLP.
            loss: Injected loss object called in training mode.
            fpn_dims (list[int]): Channel counts handed to the mask head.
            with_mask_head (bool): Whether to build the mask branch.
            use_focal_loss (bool): When False, one extra (background)
                class slot is appended to the classifier.
        """
        super(DETRHead, self).__init__()
        # add background class
        if use_focal_loss:
            self.num_classes = num_classes
        else:
            self.num_classes = num_classes + 1
        self.hidden_dim = hidden_dim
        self.loss = loss
        self.with_mask_head = with_mask_head
        self.use_focal_loss = use_focal_loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        self.bbox_head = MLP(
            hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers)
        if self.with_mask_head:
            self.bbox_attention = MultiHeadAttentionMap(
                hidden_dim, hidden_dim, nhead)
            self.mask_head = MaskHeadFPNConv(
                hidden_dim + nhead, fpn_dims, hidden_dim)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize the classification layer."""
        linear_init_(self.score_head)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        """Derive constructor arguments from the upstream modules.

        ``fpn_dims`` is the channel list of ``input_shape`` in reverse
        order with the first (reversed) entry dropped.
        """
        reversed_channels = [spec.channels for spec in input_shape[::-1]]
        return {
            'hidden_dim': hidden_dim,
            'nhead': nhead,
            'fpn_dims': reversed_channels[1:]
        }

    @staticmethod
    def get_gt_mask_from_polygons(gt_poly, pad_mask):
        """Rasterize polygon annotations into padded mask tensors.

        Args:
            gt_poly: Per-image iterable of per-object polygon lists.
            pad_mask (Tensor): Batch padding mask; the valid height and
                width of each image are read from the sums of its first
                column and first row.

        Returns:
            list[Tensor]: One float32 tensor per image holding one mask per
            object, zero-padded to ``pad_mask.shape[1:3]``.
        """
        padded_masks = []
        for polygons, padding in zip(gt_poly, pad_mask):
            height = int(padding[:, 0].sum())
            width = int(padding[0, :].sum())

            decoded = []
            for obj_poly in polygons:
                merged_rle = mask_util.merge(
                    mask_util.frPyObjects(obj_poly, height, width))
                bitmap = mask_util.decode(merged_rle)
                decoded.append(paddle.to_tensor(bitmap).astype('float32'))
            stacked = paddle.stack(decoded)

            # Paste the masks into the top-left corner of a zero canvas
            # that has the padded batch resolution.
            canvas = paddle.zeros(
                [stacked.shape[0], pad_mask.shape[1], pad_mask.shape[2]])
            canvas[:, :height, :width] = stacked
            padded_masks.append(canvas)
        return padded_masks

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                                num_queries, hidden_dim],
                    memory: [batch_size, hidden_dim, h, w],
                    src_proj: [batch_size, h*w, hidden_dim],
                    src_mask: [batch_size, 1, 1, h, w])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)

        Returns:
            In training mode, the value returned by ``self.loss``;
            otherwise the tuple ``(bbox, logit, seg)`` built from the last
            entry of the box and logit outputs, where ``seg`` is None
            unless the mask head is enabled.
        """
        feats, memory, src_proj, src_mask = out_transformer
        outputs_logit = self.score_head(feats)
        outputs_bbox = F.sigmoid(self.bbox_head(feats))

        outputs_seg = None
        if self.with_mask_head:
            bbox_attention_map = self.bbox_attention(feats[-1], memory,
                                                     src_mask)
            # Reverse the backbone features and drop the first reversed one.
            fpn_feats = list(body_feats[::-1])[1:]
            seg = self.mask_head(src_proj, bbox_attention_map, fpn_feats)
            seg_h, seg_w = seg.shape[-2], seg.shape[-1]
            outputs_seg = seg.reshape(
                [feats.shape[1], feats.shape[2], seg_h, seg_w])

        if not self.training:
            return (outputs_bbox[-1], outputs_logit[-1], outputs_seg)

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs
        if 'gt_poly' in inputs:
            gt_mask = self.get_gt_mask_from_polygons(inputs['gt_poly'],
                                                     inputs['pad_mask'])
        else:
            gt_mask = None
        return self.loss(
            outputs_bbox,
            outputs_logit,
            inputs['gt_bbox'],
            inputs['gt_class'],
            masks=outputs_seg,
            gt_mask=gt_mask)
|
||||
|
||||
|
||||
@register
class DeformableDETRHead(nn.Layer):
    """Head that predicts class logits and reference-point-relative boxes.

    The first two box values are offset by the inverse-sigmoid of the
    reference points before the final sigmoid.
    """
    __shared__ = ['num_classes', 'hidden_dim']
    __inject__ = ['loss']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=512,
                 nhead=8,
                 num_mlp_layers=3,
                 loss='DETRLoss'):
        """
        Args:
            num_classes (int): Output size of the classification layer.
            hidden_dim (int): Width of the transformer features.
            nhead (int): Stored on the instance; not used by this class's
                own methods.
            num_mlp_layers (int): Number of layers of the box MLP.
            loss: Injected loss object called in training mode.
        """
        super(DeformableDETRHead, self).__init__()
        self.num_classes = num_classes
        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.loss = loss

        self.score_head = nn.Linear(hidden_dim, self.num_classes)
        self.bbox_head = MLP(
            hidden_dim, hidden_dim, output_dim=4, num_layers=num_mlp_layers)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the classifier and the last layer of the box MLP.

        The classifier bias is set to -4.595, the last box layer's weight
        to the ``constant_`` default, and its bias to 0 for the first two
        entries and -2.0 for the remaining ones.
        """
        linear_init_(self.score_head)
        constant_(self.score_head.bias, -4.595)

        final_layer = self.bbox_head.layers[-1]
        constant_(final_layer.weight)

        with paddle.no_grad():
            new_bias = paddle.zeros_like(final_layer.bias)
            new_bias[2:] = -2.0
            final_layer.bias.set_value(new_bias)

    @classmethod
    def from_config(cls, cfg, hidden_dim, nhead, input_shape):
        """Forward ``hidden_dim`` and ``nhead`` as constructor arguments."""
        return dict(hidden_dim=hidden_dim, nhead=nhead)

    def forward(self, out_transformer, body_feats, inputs=None):
        r"""
        Args:
            out_transformer (Tuple): (feats: [num_levels, batch_size,
                                                num_queries, hidden_dim],
                    memory: [batch_size,
                        \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim],
                    reference_points: [batch_size, num_queries, 2])
            body_feats (List(Tensor)): list[[B, C, H, W]]
            inputs (dict): dict(inputs)

        Returns:
            In training mode, the value returned by ``self.loss``;
            otherwise ``(bbox, logit, None)`` taken from the last entry of
            the box and logit outputs.
        """
        feats, memory, reference_points = out_transformer
        reference_logits = inverse_sigmoid(reference_points.unsqueeze(0))
        raw_bbox = self.bbox_head(feats)

        # It's equivalent to "raw_bbox[:, :, :, :2] += reference_logits",
        # but the gradient is wrong in paddle.
        shifted_xy = raw_bbox[:, :, :, :2] + reference_logits
        remaining = raw_bbox[:, :, :, 2:]
        outputs_bbox = F.sigmoid(
            paddle.concat([shifted_xy, remaining], axis=-1))
        outputs_logit = self.score_head(feats)

        if not self.training:
            return (outputs_bbox[-1], outputs_logit[-1], None)

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs
        return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'],
                         inputs['gt_class'])
|
||||
|
||||
|
||||
@register
class DINOHead(nn.Layer):
    """Head that routes decoder/encoder predictions into the injected loss.

    The transformer already produces boxes and logits, so this head only
    separates the denoising part from the matching part and calls the loss
    in training mode, or returns the last decoder predictions otherwise.
    """
    __inject__ = ['loss']

    def __init__(self, loss='DINOLoss'):
        super(DINOHead, self).__init__()
        self.loss = loss

    def _dual_group_loss(self, dec_out_bboxes, dec_out_logits,
                         enc_topk_bboxes, enc_topk_logits, dn_meta, inputs):
        """Average the loss over query groups when ``dn_meta`` is a list.

        Every tensor is split into ``len(dn_meta)`` equal groups (decoder
        outputs on axis 2, encoder top-k outputs on axis 1). The loss of
        each group is computed separately, summed per key and divided by
        the number of groups.
        """
        num_groups = len(dn_meta)
        bbox_groups = paddle.split(dec_out_bboxes, num_groups, axis=2)
        logit_groups = paddle.split(dec_out_logits, num_groups, axis=2)
        enc_bbox_groups = paddle.split(enc_topk_bboxes, num_groups, axis=1)
        enc_logit_groups = paddle.split(enc_topk_logits, num_groups, axis=1)

        summed = {}
        for g_id, group_meta in enumerate(dn_meta):
            if group_meta is None:
                dn_bboxes, dn_logits = None, None
                dec_bboxes = bbox_groups[g_id]
                dec_logits = logit_groups[g_id]
            else:
                dn_bboxes, dec_bboxes = paddle.split(
                    bbox_groups[g_id], group_meta['dn_num_split'], axis=2)
                dn_logits, dec_logits = paddle.split(
                    logit_groups[g_id], group_meta['dn_num_split'], axis=2)

            group_bboxes = paddle.concat(
                [enc_bbox_groups[g_id].unsqueeze(0), dec_bboxes])
            group_logits = paddle.concat(
                [enc_logit_groups[g_id].unsqueeze(0), dec_logits])
            group_loss = self.loss(
                group_bboxes,
                group_logits,
                inputs['gt_bbox'],
                inputs['gt_class'],
                dn_out_bboxes=dn_bboxes,
                dn_out_logits=dn_logits,
                dn_meta=group_meta)

            # sum loss
            for key, value in group_loss.items():
                summed[key] = summed.get(key, paddle.zeros([1])) + value

        # average across all groups
        return {key: value / num_groups for key, value in summed.items()}

    def forward(self, out_transformer, body_feats, inputs=None):
        """
        Args:
            out_transformer (Tuple): (dec_out_bboxes, dec_out_logits,
                enc_topk_bboxes, enc_topk_logits, dn_meta).
            body_feats: Unused by this head.
            inputs (dict): Must hold 'gt_bbox' and 'gt_class' in training.

        Returns:
            In training mode, the loss dict; otherwise
            ``(dec_out_bboxes[-1], dec_out_logits[-1], None)``.
        """
        (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits,
         dn_meta) = out_transformer

        if not self.training:
            return (dec_out_bboxes[-1], dec_out_logits[-1], None)

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs

        if isinstance(dn_meta, list):
            return self._dual_group_loss(dec_out_bboxes, dec_out_logits,
                                         enc_topk_bboxes, enc_topk_logits,
                                         dn_meta, inputs)

        if dn_meta is None:
            dn_out_bboxes, dn_out_logits = None, None
        else:
            # Separate the denoising queries from the matching queries.
            dn_out_bboxes, dec_out_bboxes = paddle.split(
                dec_out_bboxes, dn_meta['dn_num_split'], axis=2)
            dn_out_logits, dec_out_logits = paddle.split(
                dec_out_logits, dn_meta['dn_num_split'], axis=2)

        # Prepend the encoder top-k predictions as an extra leading entry.
        out_bboxes = paddle.concat(
            [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes])
        out_logits = paddle.concat(
            [enc_topk_logits.unsqueeze(0), dec_out_logits])

        return self.loss(
            out_bboxes,
            out_logits,
            inputs['gt_bbox'],
            inputs['gt_class'],
            dn_out_bboxes=dn_out_bboxes,
            dn_out_logits=dn_out_logits,
            dn_meta=dn_meta)
|
||||
|
||||
|
||||
@register
class MaskDINOHead(nn.Layer):
    """Head that routes logits, boxes and masks into the injected loss.

    In training mode the denoising part is separated from the matching
    part, the optional initial predictions and the encoder predictions are
    prepended, and the loss is called. Otherwise the last decoder
    predictions are returned.
    """
    __inject__ = ['loss']

    def __init__(self, loss='DINOLoss'):
        super(MaskDINOHead, self).__init__()
        self.loss = loss

    def forward(self, out_transformer, body_feats, inputs=None):
        """
        Args:
            out_transformer (Tuple): (dec_out_logits, dec_out_bboxes,
                dec_out_masks, enc_out, init_out, dn_meta).
            body_feats: Unused by this head.
            inputs (dict): Must hold 'gt_bbox', 'gt_class' and 'gt_segm'
                in training mode.

        Returns:
            In training mode, the value returned by ``self.loss``;
            otherwise ``(dec_out_bboxes[-1], dec_out_logits[-1],
            dec_out_masks[-1])``.
        """
        (dec_out_logits, dec_out_bboxes, dec_out_masks, enc_out, init_out,
         dn_meta) = out_transformer

        if not self.training:
            return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1])

        assert inputs is not None
        assert 'gt_bbox' in inputs and 'gt_class' in inputs
        assert 'gt_segm' in inputs

        if dn_meta is None:
            dn_out_bboxes, dn_out_logits = None, None
            dn_out_masks = None
        else:
            sections = dn_meta['dn_num_split']
            dn_out_logits, dec_out_logits = paddle.split(
                dec_out_logits, sections, axis=2)
            dn_out_bboxes, dec_out_bboxes = paddle.split(
                dec_out_bboxes, sections, axis=2)
            dn_out_masks, dec_out_masks = paddle.split(
                dec_out_masks, sections, axis=2)

            if init_out is not None:
                init_logits, init_bboxes, init_masks = init_out
                # The initial predictions carry no leading axis, so the
                # query axis is 1 here instead of 2.
                init_logits_dn, init_logits = paddle.split(
                    init_logits, sections, axis=1)
                init_bboxes_dn, init_bboxes = paddle.split(
                    init_bboxes, sections, axis=1)
                init_masks_dn, init_masks = paddle.split(
                    init_masks, sections, axis=1)

                dec_out_logits = paddle.concat(
                    [init_logits.unsqueeze(0), dec_out_logits])
                dec_out_bboxes = paddle.concat(
                    [init_bboxes.unsqueeze(0), dec_out_bboxes])
                dec_out_masks = paddle.concat(
                    [init_masks.unsqueeze(0), dec_out_masks])

                dn_out_logits = paddle.concat(
                    [init_logits_dn.unsqueeze(0), dn_out_logits])
                dn_out_bboxes = paddle.concat(
                    [init_bboxes_dn.unsqueeze(0), dn_out_bboxes])
                dn_out_masks = paddle.concat(
                    [init_masks_dn.unsqueeze(0), dn_out_masks])

        # Prepend the encoder predictions as an extra leading entry.
        enc_logits, enc_bboxes, enc_masks = enc_out
        out_logits = paddle.concat([enc_logits.unsqueeze(0), dec_out_logits])
        out_bboxes = paddle.concat([enc_bboxes.unsqueeze(0), dec_out_bboxes])
        out_masks = paddle.concat([enc_masks.unsqueeze(0), dec_out_masks])

        return self.loss(
            out_bboxes,
            out_logits,
            inputs['gt_bbox'],
            inputs['gt_class'],
            masks=out_masks,
            gt_mask=inputs['gt_segm'],
            dn_out_logits=dn_out_logits,
            dn_out_bboxes=dn_out_bboxes,
            dn_out_masks=dn_out_masks,
            dn_meta=dn_meta)
|
||||
325
rtdetr_paddle/ppdet/modeling/initializer.py
Normal file
325
rtdetr_paddle/ppdet/modeling/initializer.py
Normal file
@@ -0,0 +1,325 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
|
||||
The copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file.
|
||||
"""
|
||||
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
# Names exported by `from ... import *`.
# NOTE(review): `vector_` and `bias_init_with_prob` are defined in this module
# but are not listed here - confirm whether that omission is intentional.
__all__ = [
    'uniform_',
    'normal_',
    'constant_',
    'ones_',
    'zeros_',
    'xavier_uniform_',
    'xavier_normal_',
    'kaiming_uniform_',
    'kaiming_normal_',
    'linear_init_',
    'conv_init_',
    'reset_initialized_parameter',
]
|
||||
|
||||
|
||||
def _no_grad_uniform_(tensor, a, b):
    """Overwrite ``tensor`` with uniform samples (min ``a``, max ``b``).

    The samples use the tensor's own shape and dtype and are written under
    ``paddle.no_grad()``. Returns the same tensor.
    """
    with paddle.no_grad():
        samples = paddle.uniform(
            shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)
        tensor.set_value(samples)
    return tensor
|
||||
|
||||
|
||||
def _no_grad_normal_(tensor, mean=0., std=1.):
    """Overwrite ``tensor`` with normal samples of the given mean and std.

    The samples use the tensor's shape and are written under
    ``paddle.no_grad()``. Returns the same tensor.
    """
    with paddle.no_grad():
        samples = paddle.normal(mean=mean, std=std, shape=tensor.shape)
        tensor.set_value(samples)
    return tensor
|
||||
|
||||
|
||||
def _no_grad_fill_(tensor, value=0.):
    """Overwrite every element of ``tensor`` with ``value``.

    The fill keeps the tensor's dtype and is written under
    ``paddle.no_grad()``. Returns the same tensor.
    """
    with paddle.no_grad():
        filled = paddle.full_like(tensor, value, dtype=tensor.dtype)
        tensor.set_value(filled)
    return tensor
|
||||
|
||||
|
||||
def uniform_(tensor, a, b):
    """
    Fill a tensor in place with uniformly distributed values.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): min value.
        b (float|int): max value.
    Return:
        The same tensor.
    """
    return _no_grad_uniform_(tensor, a=a, b=b)
|
||||
|
||||
|
||||
def normal_(tensor, mean=0., std=1.):
    """
    Fill a tensor in place with normally distributed values.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        mean (float|int): mean value.
        std (float|int): std value.
    Return:
        The same tensor.
    """
    return _no_grad_normal_(tensor, mean=mean, std=std)
|
||||
|
||||
|
||||
def constant_(tensor, value=0.):
    """
    Fill a tensor in place with a single constant.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        value (float|int): value to fill tensor.
    Return:
        The same tensor.
    """
    return _no_grad_fill_(tensor, value=value)
|
||||
|
||||
|
||||
def ones_(tensor):
    """
    Fill a tensor in place with ones.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
    Return:
        The same tensor.
    """
    return _no_grad_fill_(tensor, value=1)
|
||||
|
||||
|
||||
def zeros_(tensor):
    """
    Fill a tensor in place with zeros.

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
    Return:
        The same tensor.
    """
    return _no_grad_fill_(tensor, value=0)
|
||||
|
||||
|
||||
def vector_(tensor, vector):
    """Overwrite ``tensor`` with ``vector`` converted to the tensor's dtype.

    The value is written under ``paddle.no_grad()``. Returns the same tensor.
    """
    with paddle.no_grad():
        new_value = paddle.to_tensor(vector, dtype=tensor.dtype)
        tensor.set_value(new_value)
    return tensor
|
||||
|
||||
|
||||
def _calculate_fan_in_and_fan_out(tensor, reverse=False):
    """
    Compute (fan_in, fan_out) from a weight tensor's shape.

    Args:
        tensor (Tensor): paddle.Tensor with at least 2 dimensions.
        reverse (bool): layout of the first two axes. False (default)
            means [fout, fin, ...], e.g. conv.weight [cout, cin, kh, kw];
            True means [fin, fout, ...], e.g. linear.weight [cin, cout].

    Return:
        Tuple[fan_in, fan_out], each multiplied by the product of all axes
        after the first two.

    Raises:
        ValueError: if the tensor has fewer than 2 dimensions.
    """
    if tensor.ndim < 2:
        raise ValueError(
            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
        )

    dims = tensor.shape
    if reverse:
        in_maps, out_maps = dims[0], dims[1]
    else:
        out_maps, in_maps = dims[0], dims[1]

    # Trailing axes (e.g. kernel height/width) scale both fans.
    field_size = np.prod(dims[2:]) if tensor.ndim > 2 else 1

    return in_maps * field_size, out_maps * field_size
|
||||
|
||||
|
||||
def xavier_uniform_(tensor, gain=1., reverse=False):
    """
    Fill a tensor in place with Xavier (Glorot) uniform values.

    Values are drawn uniformly from [-k, k] with
    k = sqrt(3) * gain * sqrt(2 / (fan_in + fan_out)).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        The same tensor.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    fan_sum = float(fan_in + fan_out)
    std = gain * math.sqrt(2.0 / fan_sum)
    bound = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -bound, bound)
|
||||
|
||||
|
||||
def xavier_normal_(tensor, gain=1., reverse=False):
    """
    Fill a tensor in place with Xavier (Glorot) normal values.

    Values are drawn from a normal distribution with mean 0 and
    std = gain * sqrt(2 / (fan_in + fan_out)).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        gain (float): scaling factor, 1. by default.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        The same tensor.
    """
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
    fan_sum = float(fan_in + fan_out)
    std = gain * math.sqrt(2.0 / fan_sum)
    return _no_grad_normal_(tensor, 0, std)
|
||||
|
||||
|
||||
# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
|
||||
def _calculate_correct_fan(tensor, mode, reverse=False):
    """Return the fan selected by ``mode`` ('fan_in' or 'fan_out').

    ``mode`` is matched case-insensitively; any other value raises
    ValueError.
    """
    mode = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if mode not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(
            mode, valid_modes))

    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)

    if mode == 'fan_in':
        return fan_in
    return fan_out
|
||||
|
||||
|
||||
def _calculate_gain(nonlinearity, param=None):
    """Return the gain associated with a nonlinearity name.

    Args:
        nonlinearity (str): one of the linear/conv names, 'sigmoid',
            'tanh', 'relu', 'leaky_relu' or 'selu'.
        param (int|float|None): negative slope, only read for
            'leaky_relu' (0.01 when None).

    Raises:
        ValueError: for an unknown nonlinearity, or when ``param`` is not
            a real number (bools are rejected) for 'leaky_relu'.
    """
    unit_gain_fns = (
        'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d',
        'conv_transpose2d', 'conv_transpose3d'
    )
    if nonlinearity in unit_gain_fns or nonlinearity == 'sigmoid':
        return 1
    if nonlinearity == 'tanh':
        return 5.0 / 3
    if nonlinearity == 'relu':
        return math.sqrt(2.0)
    if nonlinearity == 'selu':
        return 3.0 / 4
    if nonlinearity != 'leaky_relu':
        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))

    if param is None:
        negative_slope = 0.01
    elif isinstance(param, float) or (isinstance(param, int) and
                                      not isinstance(param, bool)):
        # True/False are instances of int, hence the explicit bool check
        negative_slope = param
    else:
        raise ValueError("negative_slope {} not a valid number".format(
            param))
    return math.sqrt(2.0 / (1 + negative_slope**2))
|
||||
|
||||
|
||||
def kaiming_uniform_(tensor,
                     a=0,
                     mode='fan_in',
                     nonlinearity='leaky_relu',
                     reverse=False):
    """
    Fill a tensor in place with Kaiming (He) uniform values.

    Values are drawn uniformly from [-k, k] with
    k = sqrt(3) * gain / sqrt(fan).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): passed to the gain computation as its parameter.
        mode (str): 'fan_in' (default) or 'fan_out'.
        nonlinearity (str): nonlinearity method name.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        The same tensor.
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
    bound = math.sqrt(3.0) * std
    return _no_grad_uniform_(tensor, -bound, bound)
|
||||
|
||||
|
||||
def kaiming_normal_(tensor,
                    a=0,
                    mode='fan_in',
                    nonlinearity='leaky_relu',
                    reverse=False):
    """
    Fill a tensor in place with Kaiming (He) normal values.

    Values are drawn from a normal distribution with mean 0 and
    std = gain / sqrt(fan).

    Args:
        tensor (paddle.Tensor): tensor to overwrite.
        a (float|int): passed to the gain computation as its parameter.
        mode (str): 'fan_in' (default) or 'fan_out'.
        nonlinearity (str): nonlinearity method name.
        reverse (bool): tensor data format order, False by default as
            [fout, fin, ...].
    Return:
        The same tensor.
    """
    fan = _calculate_correct_fan(tensor, mode, reverse)
    std = _calculate_gain(nonlinearity, a) / math.sqrt(fan)
    return _no_grad_normal_(tensor, 0, std)
|
||||
|
||||
|
||||
def linear_init_(module):
    """Uniformly re-initialize a linear layer's weight and bias.

    Both are drawn from [-bound, bound] with
    bound = 1 / sqrt(weight.shape[0]). The bias is skipped when the module
    has none.
    """
    limit = 1 / math.sqrt(module.weight.shape[0])
    uniform_(module.weight, -limit, limit)
    if getattr(module, "bias", None) is not None:
        uniform_(module.bias, -limit, limit)
|
||||
|
||||
|
||||
def conv_init_(module):
    """Uniformly re-initialize a convolution's weight and bias.

    Both are drawn from [-bound, bound] with
    bound = 1 / sqrt(prod(weight.shape[1:])). The bias is skipped when it
    is None.
    """
    weights_per_output = np.prod(module.weight.shape[1:])
    limit = 1 / np.sqrt(weights_per_output)
    uniform_(module.weight, -limit, limit)
    if module.bias is not None:
        uniform_(module.bias, -limit, limit)
|
||||
|
||||
|
||||
def bias_init_with_prob(prior_prob=0.01):
    """Return the bias value -log((1 - p) / p) for a prior probability p.

    This is the value whose sigmoid equals ``prior_prob``; it is meant for
    initializing conv/fc biases.
    """
    odds_against = (1 - prior_prob) / prior_prob
    return float(-np.log(odds_against))
|
||||
|
||||
|
||||
@paddle.no_grad()
def reset_initialized_parameter(model, include_self=True):
    """
    Re-initialize the parameters of conv, linear, embedding and norm sublayers.

    * ``nn.Conv2D``: weight and bias uniform in [-k, k] with
      k = sqrt(groups / (in_channels * kernel_h * kernel_w)).
    * ``nn.Linear``: weight and bias uniform in [-k, k] with
      k = sqrt(1 / weight.shape[0]).
    * ``nn.Embedding``: weight normal with mean 0 and std 1.
    * ``nn.BatchNorm2D`` / ``nn.LayerNorm``: weight 1, bias 0.

    Other layer types are left untouched.

    Args:
        model (paddle.Layer): layer whose sublayers are visited.
        include_self (bool): forwarded to ``Layer.named_sublayers``;
            whether the layer itself is visited too. True by default.
    Return:
        None
    """

    def _bias_present(layer):
        return hasattr(layer, 'bias') and getattr(layer, 'bias') is not None

    for _, layer in model.named_sublayers(include_self=include_self):
        if isinstance(layer, nn.Conv2D):
            kernel_h, kernel_w = layer._kernel_size[0], layer._kernel_size[1]
            bound = math.sqrt(
                float(layer._groups) /
                (layer._in_channels * kernel_h * kernel_w))
            _no_grad_uniform_(layer.weight, -bound, bound)
            if _bias_present(layer):
                _no_grad_uniform_(layer.bias, -bound, bound)
            continue

        if isinstance(layer, nn.Linear):
            bound = math.sqrt(1. / layer.weight.shape[0])
            _no_grad_uniform_(layer.weight, -bound, bound)
            if _bias_present(layer):
                _no_grad_uniform_(layer.bias, -bound, bound)
            continue

        if isinstance(layer, nn.Embedding):
            _no_grad_normal_(layer.weight, mean=0., std=1.)
            continue

        if isinstance(layer, (nn.BatchNorm2D, nn.LayerNorm)):
            _no_grad_fill_(layer.weight, 1.)
            if _bias_present(layer):
                _no_grad_fill_(layer.bias, 0)
|
||||
403
rtdetr_paddle/ppdet/modeling/keypoint_utils.py
Normal file
403
rtdetr_paddle/ppdet/modeling/keypoint_utils.py
Normal file
@@ -0,0 +1,403 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
this code is based on https://github.com/open-mmlab/mmpose
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
def get_affine_mat_kernel(h, w, s, inv=False):
    """Build the affine matrix that resizes an image so its short side is ``s``.

    The long side of the resized image is rounded up to a multiple of 64.

    Args:
        h (int|float): source image height.
        w (int|float): source image width.
        s (int): target length of the shorter side.
        inv (bool): forwarded to ``get_affine_transform``.

    Returns:
        tuple: (affine matrix, (resized_width, resized_height)).
    """
    if w < h:
        out_w = s
        out_h = int(np.ceil((s / w * h) / 64.) * 64)
        src_w = w
        src_h = out_h / out_w * w
    else:
        out_h = s
        out_w = int(np.ceil((s / h * w) / 64.) * 64)
        src_h = h
        src_w = out_w / out_h * h

    center = np.array([np.round(w / 2.), np.round(h / 2.)])
    size_resized = (out_w, out_h)
    source_extent = np.array([src_w, src_h])

    trans = get_affine_transform(
        center, source_extent, 0, size_resized, inv=inv)
    return trans, size_resized
|
||||
|
||||
|
||||
def get_affine_transform(center,
                         input_size,
                         rot,
                         output_size,
                         shift=(0., 0.),
                         inv=False):
    """Get the affine transform matrix, given the center/scale/rot/output_size.

    Three point pairs are built (the center, a point half a width away
    from it along the rotated axis, and a third point perpendicular to
    those two) and handed to ``cv2.getAffineTransform``.

    Args:
        center (np.ndarray[2, ]): Center of the bounding box (x, y).
        input_size (np.ndarray[2, ]): Size of input feature (width, height).
            A scalar is expanded to ``[input_size, input_size]``.
        rot (float): Rotation angle (degree).
        output_size (np.ndarray[2, ]): Size of the destination heatmaps.
        shift (0-100%): Shift translation ratio wrt the width/height.
            Default (0., 0.).
        inv (bool): Option to inverse the affine transform direction.
            (inv=False: src->dst or inv=True: dst->src)

    Returns:
        np.ndarray: The transform matrix.
    """
    assert len(center) == 2
    assert len(output_size) == 2
    assert len(shift) == 2

    if not isinstance(input_size, (np.ndarray, list)):
        input_size = np.array([input_size, input_size], dtype=np.float32)

    shift = np.array(shift)
    dst_w, dst_h = output_size[0], output_size[1]

    angle = np.pi * rot / 180
    src_dir = rotate_point([0., input_size[0] * -0.5], angle)
    dst_dir = np.array([0., dst_w * -0.5])

    # Translation applied to both source anchor points.
    offset = input_size * shift

    src = np.zeros((3, 2), dtype=np.float32)
    src[0, :] = center + offset
    src[1, :] = center + src_dir + offset
    src[2, :] = _get_3rd_point(src[0, :], src[1, :])

    dst_center = [dst_w * 0.5, dst_h * 0.5]
    dst = np.zeros((3, 2), dtype=np.float32)
    dst[0, :] = dst_center
    dst[1, :] = np.array(dst_center) + dst_dir
    dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])

    points_from, points_to = (dst, src) if inv else (src, dst)
    return cv2.getAffineTransform(
        np.float32(points_from), np.float32(points_to))
|
||||
|
||||
|
||||
def get_warp_matrix(theta, size_input, size_dst, size_target):
    """This code is based on
        https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py

        Calculate the transformation matrix under the constraint of unbiased.
    Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
    Data Processing for Human Pose Estimation (CVPR 2020).

    Args:
        theta (float): Rotation angle in degrees.
        size_input (np.ndarray): Size of input image [w, h].
        size_dst (np.ndarray): Size of output image [w, h].
        size_target (np.ndarray): Size of ROI in input plane [w, h].

    Returns:
        matrix (np.ndarray): A float32 matrix of shape (2, 3).
    """
    theta = np.deg2rad(theta)
    cos_t = np.cos(theta)
    sin_t = np.sin(theta)
    scale_x = size_dst[0] / size_target[0]
    scale_y = size_dst[1] / size_target[1]

    shift_x = scale_x * (
        -0.5 * size_input[0] * cos_t + 0.5 * size_input[1] * sin_t +
        0.5 * size_target[0])
    shift_y = scale_y * (
        -0.5 * size_input[0] * sin_t - 0.5 * size_input[1] * cos_t +
        0.5 * size_target[1])

    matrix = np.zeros((2, 3), dtype=np.float32)
    matrix[0, :] = [cos_t * scale_x, -sin_t * scale_x, shift_x]
    matrix[1, :] = [sin_t * scale_y, cos_t * scale_y, shift_y]
    return matrix
|
||||
|
||||
|
||||
def _get_3rd_point(a, b):
    """To calculate the affine matrix, three pairs of points are required. This
    function is used to get the 3rd point, given 2D points a & b.

    The 3rd point is defined by rotating vector `a - b` by 90 degrees
    anticlockwise, using b as the rotation center.

    Args:
        a (np.ndarray): point(x,y)
        b (np.ndarray): point(x,y)

    Returns:
        np.ndarray: The 3rd point.
    """
    assert len(
        a) == 2, 'input of _get_3rd_point should be point with length of 2'
    assert len(
        b) == 2, 'input of _get_3rd_point should be point with length of 2'
    delta_x, delta_y = a - b
    # (x, y) rotated by 90 degrees is (-y, x).
    perpendicular = np.array([-delta_y, delta_x], dtype=np.float32)

    return b + perpendicular
|
||||
|
||||
|
||||
def rotate_point(pt, angle_rad):
    """Rotate a point by an angle.

    Args:
        pt (list[float]): 2 dimensional point to be rotated
        angle_rad (float): rotation angle by radian

    Returns:
        list[float]: Rotated point.
    """
    assert len(pt) == 2
    x, y = pt[0], pt[1]
    sin_a = np.sin(angle_rad)
    cos_a = np.cos(angle_rad)

    return [x * cos_a - y * sin_a, x * sin_a + y * cos_a]
|
||||
|
||||
|
||||
def transpred(kpts, h, w, s):
    """Map keypoint (x, y) coordinates through the inverse resize transform.

    The transform comes from ``get_affine_mat_kernel(h, w, s, inv=True)``.
    Only the first two values of the last axis of ``kpts`` are used, and
    they are copied first.
    """
    inverse_trans, _ = get_affine_mat_kernel(h, w, s, inv=True)
    xy = kpts[..., :2].copy()

    return warp_affine_joints(xy, inverse_trans)
|
||||
|
||||
|
||||
def warp_affine_joints(joints, mat):
    """Apply affine transformation defined by the transform matrix on the
    joints.

    Args:
        joints (np.ndarray[..., 2]): Origin coordinate of joints.
        mat (np.ndarray[2, 3]): The affine matrix; its transpose is
            right-multiplied onto the homogeneous [x, y, 1] rows.

    Returns:
        matrix (np.ndarray[..., 2]): Result coordinate of joints.
    """
    joints = np.array(joints)
    original_shape = joints.shape
    flat = joints.reshape(-1, 2)
    # Column of ones derived from the data so it shares the joints' dtype.
    ones_column = flat[:, 0:1] * 0 + 1
    homogeneous = np.concatenate((flat, ones_column), axis=1)
    return np.dot(homogeneous, mat.T).reshape(original_shape)
|
||||
|
||||
|
||||
def affine_transform(pt, t):
    """Apply affine matrix ``t`` to the 2-D point ``pt``.

    The point is extended to [x, y, 1], multiplied by ``t`` and the first
    two values of the result are returned.
    """
    homogeneous = np.array([pt[0], pt[1], 1.]).T
    return np.dot(t, homogeneous)[:2]
|
||||
|
||||
|
||||
def transform_preds(coords, center, scale, output_size):
    """Map predicted coordinates back through the inverse affine transform.

    The transform is built from ``center`` and ``scale * 200`` with zero
    rotation. Only the first two columns of each row are transformed; any
    remaining columns of the result stay zero.

    Returns:
        np.ndarray: array with the same shape as ``coords``.
    """
    target_coords = np.zeros(coords.shape)
    trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1)
    for row, point in enumerate(coords):
        target_coords[row, 0:2] = affine_transform(point[0:2], trans)
    return target_coords
|
||||
|
||||
|
||||
def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None):
    """Compute the object keypoint similarity between one pose and many.

    Args:
        g (np.ndarray): Flattened reference pose laid out as
            [x0, y0, v0, x1, y1, v1, ...].
        d (np.ndarray): Candidate poses, one flattened pose per row, in
            the same layout as ``g``.
        a_g (float): Area associated with the reference pose.
        a_d (np.ndarray): Areas of the candidate poses, indexed like the
            rows of ``d``.
        sigmas (np.ndarray): Per-keypoint constants. Anything that is not
            an ``np.ndarray`` (including None) selects the built-in
            17-value table.
        in_vis_thre (float): When given, only keypoints whose third value
            exceeds this threshold in BOTH the reference pose and the
            candidate pose contribute. Default: None (use all keypoints).

    Returns:
        np.ndarray: One similarity value per row of ``d``; 0.0 where no
        keypoint passes the visibility filter.
    """
    if not isinstance(sigmas, np.ndarray):
        sigmas = np.array([
            .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07,
            .87, .87, .89, .89
        ]) / 10.0
    variances = (sigmas * 2)**2
    xg = g[0::3]
    yg = g[1::3]
    vg = g[2::3]
    ious = np.zeros((d.shape[0]))
    for n_d in range(0, d.shape[0]):
        xd = d[n_d, 0::3]
        yd = d[n_d, 1::3]
        vd = d[n_d, 2::3]
        dx = xd - xg
        dy = yd - yg
        e = (dx**2 + dy**2) / variances / (
            (a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2
        if in_vis_thre is not None:
            # Element-wise AND. A Python `and` between two lists would
            # simply return the second list and ignore `vg` altogether.
            ind = np.logical_and(vg > in_vis_thre, vd > in_vis_thre)
            e = e[ind]
        ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0
    return ious
|
||||
|
||||
|
||||
def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
    """Greedy NMS on keypoint detections using ``oks_iou`` as the overlap.

    The highest-scoring remaining detection is kept, and every other
    detection whose overlap with it exceeds ``thresh`` is discarded.

    Args:
        kpts_db (list): Detections; each entry provides ``'score'``,
            ``'keypoints'`` (an object with ``flatten()``) and ``'area'``.
        thresh (float): Detections with overlap <= thresh survive a round.
        sigmas (np.array): Forwarded to ``oks_iou``. Default: None
        in_vis_thre (float): Forwarded to ``oks_iou``. Default: None

    Return:
        keep (list): indexes into ``kpts_db`` of the kept detections
    """
    if len(kpts_db) == 0:
        return []

    scores = np.array([det['score'] for det in kpts_db])
    kpts = np.array([det['keypoints'].flatten() for det in kpts_db])
    areas = np.array([det['area'] for det in kpts_db])

    # Candidate indices, best score first.
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        best = order[0]
        rest = order[1:]
        keep.append(best)

        oks_ovr = oks_iou(kpts[best], kpts[rest], areas[best], areas[rest],
                          sigmas, in_vis_thre)

        survivors = np.where(oks_ovr <= thresh)[0]
        order = rest[survivors]

    return keep
|
||||
|
||||
|
||||
def rescore(overlap, scores, thresh, type='gaussian'):
    """Decay detection scores according to their overlap (soft-NMS style).

    Args:
        overlap (np.ndarray): Overlap of each detection with the current one.
        scores (np.ndarray): Scores to decay, same length as ``overlap``.
        thresh (float): Overlap threshold (linear) or decay width (gaussian).
        type (str): ``'linear'`` scales scores with overlap >= thresh by
            ``1 - overlap``; any other value applies
            ``exp(-overlap**2 / thresh)`` to all scores.

    Returns:
        np.ndarray: The decayed scores. In the linear mode ``scores`` itself
        is modified in place and returned; otherwise a new array is returned.
    """
    assert overlap.shape[0] == scores.shape[0]
    if type != 'linear':
        return scores * np.exp(-overlap**2 / thresh)

    penalized = np.where(overlap >= thresh)[0]
    scores[penalized] = scores[penalized] * (1 - overlap[penalized])
    return scores
|
||||
|
||||
|
||||
def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None, max_dets=20):
    """Soft NMS on keypoint detections using ``oks_iou`` as the overlap.

    Instead of discarding overlapping detections, the scores of the
    remaining candidates are decayed with ``rescore`` after each pick and
    the candidates are re-sorted. Selection stops once ``max_dets``
    detections have been picked or no candidate is left.

    Args:
        kpts_db (list): Detections; each entry provides ``'score'``,
            ``'keypoints'`` (an object with ``flatten()``) and ``'area'``.
        thresh (float): Threshold forwarded to ``rescore``.
        sigmas (np.array): Forwarded to ``oks_iou``. Default: None
        in_vis_thre (float): Forwarded to ``oks_iou``. Default: None
        max_dets (int, optional): Maximum number of detections to keep.
            ``None`` means "as many as there are detections". Default: 20

    Return:
        keep (np.ndarray): indexes into ``kpts_db`` of the kept detections,
        in pick order (an empty list when ``kpts_db`` is empty).
    """
    if len(kpts_db) == 0:
        return []

    scores = np.array([det['score'] for det in kpts_db])
    kpts = np.array([det['keypoints'].flatten() for det in kpts_db])
    areas = np.array([det['area'] for det in kpts_db])

    # Candidate indices and their scores, best score first.
    order = scores.argsort()[::-1]
    scores = scores[order]

    if max_dets is None:
        max_dets = order.size
    keep = np.zeros(max_dets, dtype=np.intp)
    keep_cnt = 0
    while order.size > 0 and keep_cnt < max_dets:
        i = order[0]

        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
                          sigmas, in_vis_thre)

        # Drop the picked detection, decay the others and re-sort them.
        order = order[1:]
        scores = rescore(oks_ovr, scores[1:], thresh)

        tmp = scores.argsort()[::-1]
        order = order[tmp]
        scores = scores[tmp]

        keep[keep_cnt] = i
        keep_cnt += 1

    keep = keep[:keep_cnt]

    return keep
|
||||
|
||||
|
||||
def resize(input,
           size=None,
           scale_factor=None,
           mode='nearest',
           align_corners=None,
           warning=True):
    """Thin wrapper around ``F.interpolate`` with an alignment warning.

    When ``warning`` is set, ``size`` is given and ``align_corners`` is
    truthy, a warning is emitted if the output is larger than the input in
    either dimension and neither ``output - 1`` is a multiple of
    ``input - 1`` along height nor along width.

    Args:
        input: Input whose ``shape[2:]`` is ``(height, width)``.
        size: Target ``(height, width)``, forwarded to ``F.interpolate``.
        scale_factor: Forwarded to ``F.interpolate``.
        mode (str): Forwarded to ``F.interpolate``.
        align_corners: Forwarded to ``F.interpolate``.
        warning (bool): Whether to perform the alignment check.

    Returns:
        The result of ``F.interpolate(input, size, scale_factor, mode,
        align_corners)``.
    """
    if warning and size is not None and align_corners:
        input_h, input_w = tuple(int(x) for x in input.shape[2:])
        output_h, output_w = tuple(int(x) for x in size)
        # Each output dimension is compared with the matching INPUT
        # dimension (width against width, not against the output height).
        if output_h > input_h or output_w > input_w:
            if ((output_h > 1 and output_w > 1 and input_h > 1 and
                 input_w > 1) and (output_h - 1) % (input_h - 1) and
                    (output_w - 1) % (input_w - 1)):
                warnings.warn(
                    f'When align_corners={align_corners}, '
                    'the output would more aligned if '
                    f'input size {(input_h, input_w)} is `x+1` and '
                    f'out size {(output_h, output_w)} is `nx+1`')

    return F.interpolate(input, size, scale_factor, mode, align_corners)
|
||||
|
||||
|
||||
def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
    """Flip the flipped heatmaps back to the original form.

    Note:
        - batch_size: N
        - num_keypoints: K
        - heatmap height: H
        - heatmap width: W

    Args:
        output_flipped ([N, K, H, W]): The output heatmaps obtained from the
            flipped images. Must support ``reshape`` and ``clone()``
            (presumably a paddle Tensor; confirm against the callers). For
            ``CombinedTarget`` every channel ``1::3`` is negated in place on
            this argument.
        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
            (for example, left ear -- right ear).
        target_type (str): GaussianHeatmap or CombinedTarget
            (compared case-insensitively).

    Returns:
        Heatmaps flipped back to the original image, same shape as the input.
    """
    assert len(output_flipped.shape) == 4, \
        'output_flipped should be [batch_size, num_keypoints, height, width]'
    shape_ori = output_flipped.shape
    is_combined = target_type.lower() == 'CombinedTarget'.lower()
    if is_combined:
        # Three channels per keypoint; the second channel of each triple
        # changes sign under a horizontal flip.
        channels = 3
        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
    else:
        channels = 1
    grouped = output_flipped.reshape((shape_ori[0], -1, channels,
                                      shape_ori[2], shape_ori[3]))
    restored = grouped.clone()

    # Swap left-right parts
    for left, right in flip_pairs:
        restored[:, left, ...] = grouped[:, right, ...]
        restored[:, right, ...] = grouped[:, left, ...]
    restored = restored.reshape(shape_ori)
    # Flip horizontally
    return restored[..., ::-1]
|
||||
1346
rtdetr_paddle/ppdet/modeling/layers.py
Normal file
1346
rtdetr_paddle/ppdet/modeling/layers.py
Normal file
File diff suppressed because it is too large
Load Diff
19
rtdetr_paddle/ppdet/modeling/losses/__init__.py
Normal file
19
rtdetr_paddle/ppdet/modeling/losses/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .iou_loss import *
|
||||
from .gfocal_loss import *
|
||||
from .detr_loss import *
|
||||
from .focal_loss import *
|
||||
from .smooth_l1_loss import *
|
||||
578
rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
Normal file
578
rtdetr_paddle/ppdet/modeling/losses/detr_loss.py
Normal file
@@ -0,0 +1,578 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register
|
||||
from .iou_loss import GIoULoss
|
||||
from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits
|
||||
from ..bbox_utils import bbox_iou
|
||||
|
||||
__all__ = ['DETRLoss', 'DINOLoss']
|
||||
|
||||
|
||||
@register
class DETRLoss(nn.Layer):
    """Classification, box and optional mask losses for DETR-style models.

    Predictions are matched to ground truth by ``matcher``; the losses of
    the last decoder layer are returned under ``loss_*`` keys and, when
    ``aux_loss`` is set, the summed losses of the earlier layers under
    ``loss_*_aux`` keys.
    """
    __shared__ = ['num_classes', 'use_focal_loss']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'no_object': 0.1,
                     'mask': 1,
                     'dice': 1
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 use_vfl=False,
                 use_uni_match=False,
                 uni_match_ind=0):
        r"""
        Args:
            num_classes (int): The number of classes.
            matcher (HungarianMatcher): It computes an assignment between the targets
                and the predictions of the network.
            loss_coeff (dict): The coefficient of loss. The dict is copied, the
                caller's object is never modified.
            aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used.
            use_focal_loss (bool): Use focal loss or not.
            use_vfl (bool): With focal loss, use the varifocal loss with the IoU of
                matched boxes as target score.
            use_uni_match (bool): Compute the matching once, on decoder layer
                `uni_match_ind`, and reuse it for all auxiliary layers.
            uni_match_ind (int): Index of the layer used when `use_uni_match` is set.
        """
        super(DETRLoss, self).__init__()

        self.num_classes = num_classes
        self.matcher = matcher
        # Private copy: the 'class' entry may be replaced by a tensor below,
        # which must not leak into the shared default dict or the caller's dict.
        self.loss_coeff = dict(loss_coeff)
        self.aux_loss = aux_loss
        self.use_focal_loss = use_focal_loss
        self.use_vfl = use_vfl
        self.use_uni_match = use_uni_match
        self.uni_match_ind = uni_match_ind

        if not self.use_focal_loss:
            # Per-class weights for cross entropy; the last (background)
            # class is weighted by the 'no_object' coefficient.
            self.loss_coeff['class'] = paddle.full([num_classes + 1],
                                                   loss_coeff['class'])
            self.loss_coeff['class'][-1] = loss_coeff['no_object']
        self.giou_loss = GIoULoss()

    def _get_loss_class(self,
                        logits,
                        gt_class,
                        match_indices,
                        bg_index,
                        num_gts,
                        postfix="",
                        iou_score=None):
        """Classification loss, returned as ``{"loss_class" + postfix: loss}``."""
        # logits: [b, query, num_classes], gt_class: list[[n, 1]]
        name_class = "loss_class" + postfix

        # Every query starts as background; matched queries get their gt label.
        target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64')
        bs, num_query_objects = target_label.shape
        num_gt = sum(len(a) for a in gt_class)
        if num_gt > 0:
            index, updates = self._get_index_updates(num_query_objects,
                                                     gt_class, match_indices)
            target_label = paddle.scatter(
                target_label.reshape([-1, 1]), index, updates.astype('int64'))
            target_label = target_label.reshape([bs, num_query_objects])
        if self.use_focal_loss:
            # One-hot without the trailing background column.
            target_label = F.one_hot(target_label,
                                     self.num_classes + 1)[..., :-1]
            if iou_score is not None and self.use_vfl:
                target_score = paddle.zeros([bs, num_query_objects])
                if num_gt > 0:
                    target_score = paddle.scatter(
                        target_score.reshape([-1, 1]), index, iou_score)
                target_score = target_score.reshape(
                    [bs, num_query_objects, 1]) * target_label
                loss_ = self.loss_coeff['class'] * varifocal_loss_with_logits(
                    logits, target_score, target_label,
                    num_gts / num_query_objects)
            else:
                loss_ = self.loss_coeff['class'] * sigmoid_focal_loss(
                    logits, target_label, num_gts / num_query_objects)
        else:
            loss_ = F.cross_entropy(
                logits, target_label, weight=self.loss_coeff['class'])
        return {name_class: loss_}

    def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts,
                       postfix=""):
        """L1 and GIoU losses over matched boxes, both divided by ``num_gts``."""
        # boxes: [b, query, 4], gt_bbox: list[[n, 4]]
        name_bbox = "loss_bbox" + postfix
        name_giou = "loss_giou" + postfix

        loss = dict()
        if sum(len(a) for a in gt_bbox) == 0:
            loss[name_bbox] = paddle.to_tensor([0.])
            loss[name_giou] = paddle.to_tensor([0.])
            return loss

        src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox,
                                                            match_indices)
        loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss(
            src_bbox, target_bbox, reduction='sum') / num_gts
        loss[name_giou] = self.giou_loss(
            bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox))
        loss[name_giou] = loss[name_giou].sum() / num_gts
        loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou]
        return loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """Sigmoid focal and dice losses over matched masks."""
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # Bring the predictions to the resolution of the targets.
        src_masks = F.interpolate(
            src_masks.unsqueeze(0),
            size=target_masks.shape[-2:],
            mode="bilinear")[0]
        loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss(
            src_masks,
            target_masks,
            paddle.to_tensor(
                [num_gts], dtype='float32'))
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            src_masks, target_masks, num_gts)
        return loss

    def _dice_loss(self, inputs, targets, num_gts):
        """Dice loss on sigmoid(inputs), summed and divided by ``num_gts``."""
        inputs = F.sigmoid(inputs)
        inputs = inputs.flatten(1)
        targets = targets.flatten(1)
        numerator = 2 * (inputs * targets).sum(1)
        denominator = inputs.sum(-1) + targets.sum(-1)
        # The +1 terms smooth the ratio.
        loss = 1 - (numerator + 1) / (denominator + 1)
        return loss.sum() / num_gts

    def _get_iou_score(self, boxes, gt_bbox, match_indices):
        """IoU of matched (detached) boxes for varifocal loss, or None.

        Returns None when varifocal loss is disabled or there is no gt box.
        """
        if not self.use_vfl:
            return None
        if sum(len(a) for a in gt_bbox) == 0:
            return None
        src_bbox, target_bbox = self._get_src_target_assign(
            boxes.detach(), gt_bbox, match_indices)
        return bbox_iou(
            bbox_cxcywh_to_xyxy(src_bbox).split(4, -1),
            bbox_cxcywh_to_xyxy(target_bbox).split(4, -1))

    def _get_loss_aux(self,
                      boxes,
                      logits,
                      gt_bbox,
                      gt_class,
                      bg_index,
                      num_gts,
                      dn_match_indices=None,
                      postfix="",
                      masks=None,
                      gt_mask=None):
        """Sum of the per-layer losses of the auxiliary decoder outputs.

        The matching is, in order of precedence: ``dn_match_indices`` when
        given; one matching on layer ``uni_match_ind`` when ``use_uni_match``
        is set; otherwise a fresh matching per layer.
        """
        loss_class = []
        loss_bbox, loss_giou = [], []
        loss_mask, loss_dice = [], []
        if dn_match_indices is not None:
            match_indices = dn_match_indices
        elif self.use_uni_match:
            match_indices = self.matcher(
                boxes[self.uni_match_ind],
                logits[self.uni_match_ind],
                gt_bbox,
                gt_class,
                masks=masks[self.uni_match_ind] if masks is not None else None,
                gt_mask=gt_mask)
        for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)):
            aux_masks = masks[i] if masks is not None else None
            if not self.use_uni_match and dn_match_indices is None:
                match_indices = self.matcher(
                    aux_boxes,
                    aux_logits,
                    gt_bbox,
                    gt_class,
                    masks=aux_masks,
                    gt_mask=gt_mask)
            iou_score = self._get_iou_score(aux_boxes, gt_bbox, match_indices)
            loss_class.append(
                self._get_loss_class(aux_logits, gt_class, match_indices,
                                     bg_index, num_gts, postfix, iou_score)[
                                         'loss_class' + postfix])
            loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices,
                                        num_gts, postfix)
            loss_bbox.append(loss_['loss_bbox' + postfix])
            loss_giou.append(loss_['loss_giou' + postfix])
            if masks is not None and gt_mask is not None:
                loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices,
                                            num_gts, postfix)
                loss_mask.append(loss_['loss_mask' + postfix])
                loss_dice.append(loss_['loss_dice' + postfix])
        loss = {
            "loss_class_aux" + postfix: paddle.add_n(loss_class),
            "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox),
            "loss_giou_aux" + postfix: paddle.add_n(loss_giou)
        }
        if masks is not None and gt_mask is not None:
            loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask)
            loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice)
        return loss

    def _get_index_updates(self, num_query_objects, target, match_indices):
        """Flat query indices of the matches and the gathered target values.

        The query index of image ``i`` is offset by ``i * num_query_objects``
        so it addresses a tensor flattened over (batch, query).
        """
        batch_idx = paddle.concat([
            paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)
        ])
        src_idx = paddle.concat([src for (src, _) in match_indices])
        src_idx += (batch_idx * num_query_objects)
        target_assign = paddle.concat([
            paddle.gather(
                t, dst, axis=0) for t, (_, dst) in zip(target, match_indices)
        ])
        return src_idx, target_assign

    def _get_src_target_assign(self, src, target, match_indices):
        """Gather matched predictions and targets, concatenated over the batch.

        Images without matches contribute an empty ``[0, last_dim]`` tensor.
        """
        src_assign = paddle.concat([
            paddle.gather(
                t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (I, _) in zip(src, match_indices)
        ])
        target_assign = paddle.concat([
            paddle.gather(
                t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]])
            for t, (_, J) in zip(target, match_indices)
        ])
        return src_assign, target_assign

    def _get_num_gts(self, targets, dtype="float32"):
        """Number of gt objects as a 1-element tensor, clipped to >= 1.

        In distributed runs the count is all-reduced and divided by the
        world size.
        """
        num_gts = sum(len(a) for a in targets)
        num_gts = paddle.to_tensor([num_gts], dtype=dtype)
        if paddle.distributed.get_world_size() > 1:
            paddle.distributed.all_reduce(num_gts)
            num_gts /= paddle.distributed.get_world_size()
        num_gts = paddle.clip(num_gts, min=1.)
        return num_gts

    def _get_prediction_loss(self,
                             boxes,
                             logits,
                             gt_bbox,
                             gt_class,
                             masks=None,
                             gt_mask=None,
                             postfix="",
                             dn_match_indices=None,
                             num_gts=1):
        """Losses of a single decoder layer.

        Uses ``dn_match_indices`` when given, otherwise runs the matcher.
        """
        if dn_match_indices is None:
            match_indices = self.matcher(
                boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask)
        else:
            match_indices = dn_match_indices

        iou_score = self._get_iou_score(boxes, gt_bbox, match_indices)

        loss = dict()
        loss.update(
            self._get_loss_class(logits, gt_class, match_indices,
                                 self.num_classes, num_gts, postfix, iou_score))
        loss.update(
            self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts,
                                postfix))
        if masks is not None and gt_mask is not None:
            loss.update(
                self._get_loss_mask(masks, gt_mask, match_indices, num_gts,
                                    postfix))
        return loss

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                **kwargs):
        r"""
        Args:
            boxes (Tensor): [l, b, query, 4]
            logits (Tensor): [l, b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor, optional): [l, b, query, h, w]
            gt_mask (List(Tensor), optional): list[[n, H, W]]
            postfix (str): postfix of loss name
            **kwargs: `dn_match_indices` (precomputed matching used instead of
                the matcher) and `num_gts` (normalizer; computed from
                `gt_class` when absent) are read, other keys are ignored.

        Returns:
            dict: loss name -> loss tensor.
        """

        dn_match_indices = kwargs.get("dn_match_indices", None)
        num_gts = kwargs.get("num_gts", None)
        if num_gts is None:
            num_gts = self._get_num_gts(gt_class)

        # The last decoder layer provides the main prediction.
        total_loss = self._get_prediction_loss(
            boxes[-1],
            logits[-1],
            gt_bbox,
            gt_class,
            masks=masks[-1] if masks is not None else None,
            gt_mask=gt_mask,
            postfix=postfix,
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)

        if self.aux_loss:
            total_loss.update(
                self._get_loss_aux(
                    boxes[:-1],
                    logits[:-1],
                    gt_bbox,
                    gt_class,
                    self.num_classes,
                    num_gts,
                    dn_match_indices,
                    postfix,
                    masks=masks[:-1] if masks is not None else None,
                    gt_mask=gt_mask))

        return total_loss
|
||||
|
||||
|
||||
@register
class DINOLoss(DETRLoss):
    """DETR loss extended with the losses of the denoising queries.

    The denoising losses use the same keys as the regular ones with a
    ``_dn`` postfix; they are zero tensors when no ``dn_meta`` is given.
    """

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_meta=None,
                **kwargs):
        """Return the regular losses plus the ``*_dn`` denoising losses.

        ``masks``, ``gt_mask`` and ``postfix`` are accepted but not used
        here. ``dn_meta`` must provide ``"dn_positive_idx"`` and
        ``"dn_num_group"``.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(DINOLoss, self).forward(
            boxes, logits, gt_bbox, gt_class, num_gts=num_gts)

        if dn_meta is None:
            # No denoising branch: report zeros under the "_dn" names.
            zero_dn_loss = {
                k + '_dn': paddle.to_tensor([0.])
                for k in total_loss.keys()
            }
            total_loss.update(zero_dn_loss)
            return total_loss

        dn_positive_idx = dn_meta["dn_positive_idx"]
        dn_num_group = dn_meta["dn_num_group"]
        assert len(gt_class) == len(dn_positive_idx)

        # denoising match indices
        dn_match_indices = self.get_dn_match_indices(
            gt_class, dn_positive_idx, dn_num_group)

        # compute denoising training loss
        # (every gt appears once per denoising group)
        num_gts *= dn_num_group
        dn_loss = super(DINOLoss, self).forward(
            dn_out_bboxes,
            dn_out_logits,
            gt_bbox,
            gt_class,
            postfix="_dn",
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)
        total_loss.update(dn_loss)

        return total_loss

    @staticmethod
    def get_dn_match_indices(labels, dn_positive_idx, dn_num_group):
        """Build per-image ``(query_idx, gt_idx)`` pairs for denoising queries.

        For an image with ``n`` gts the gt indices are ``0..n-1`` tiled
        ``dn_num_group`` times and paired with ``dn_positive_idx[i]``;
        images without gts get a pair of empty int64 tensors.
        """
        dn_match_indices = []
        for img_id, img_labels in enumerate(labels):
            num_gt = len(img_labels)
            if num_gt > 0:
                gt_idx = paddle.arange(end=num_gt, dtype="int64")
                gt_idx = gt_idx.tile([dn_num_group])
                assert len(dn_positive_idx[img_id]) == len(gt_idx)
                pair = (dn_positive_idx[img_id], gt_idx)
            else:
                pair = (paddle.zeros(
                    [0], dtype="int64"), paddle.zeros(
                        [0], dtype="int64"))
            dn_match_indices.append(pair)
        return dn_match_indices
|
||||
|
||||
|
||||
@register
class MaskDINOLoss(DETRLoss):
    """DETR loss with denoising terms and a point-sampled mask loss.

    Mask losses are evaluated on ``num_sample_points`` points per mask:
    the most uncertain ones among an oversampled random set plus purely
    random ones.
    """
    __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points']
    __inject__ = ['matcher']

    def __init__(self,
                 num_classes=80,
                 matcher='HungarianMatcher',
                 loss_coeff={
                     'class': 4,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 5,
                     'dice': 5
                 },
                 aux_loss=True,
                 use_focal_loss=False,
                 num_sample_points=12544,
                 oversample_ratio=3.0,
                 important_sample_ratio=0.75):
        """
        Args:
            num_classes (int): The number of classes.
            matcher: Matcher passed on to ``DETRLoss``.
            loss_coeff (dict): The coefficient of loss.
            aux_loss (bool): Passed on to ``DETRLoss``.
            use_focal_loss (bool): Passed on to ``DETRLoss``.
            num_sample_points (int): Points sampled per mask.
            oversample_ratio (float): >= 1; the uncertainty ranking runs on
                ``int(num_sample_points * oversample_ratio)`` random points.
            important_sample_ratio (float): In [0, 1]; fraction of the
                sampled points chosen by uncertainty, the rest is random.
        """
        super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff,
                                           aux_loss, use_focal_loss)
        assert oversample_ratio >= 1
        assert important_sample_ratio <= 1 and important_sample_ratio >= 0

        self.num_sample_points = num_sample_points
        self.oversample_ratio = oversample_ratio
        self.important_sample_ratio = important_sample_ratio
        self.num_oversample_points = int(num_sample_points * oversample_ratio)
        self.num_important_points = int(num_sample_points *
                                        important_sample_ratio)
        self.num_random_points = num_sample_points - self.num_important_points

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None,
                postfix="",
                dn_out_bboxes=None,
                dn_out_logits=None,
                dn_out_masks=None,
                dn_meta=None,
                **kwargs):
        """Return the regular losses plus the ``*_dn`` denoising losses.

        ``postfix`` is accepted but not used here. ``dn_meta`` must provide
        ``"dn_positive_idx"`` and ``"dn_num_group"``; without it the
        ``*_dn`` entries are zero tensors.
        """
        num_gts = self._get_num_gts(gt_class)
        total_loss = super(MaskDINOLoss, self).forward(
            boxes,
            logits,
            gt_bbox,
            gt_class,
            masks=masks,
            gt_mask=gt_mask,
            num_gts=num_gts)

        if dn_meta is None:
            # No denoising branch: report zeros under the "_dn" names.
            zero_dn_loss = {
                k + '_dn': paddle.to_tensor([0.])
                for k in total_loss.keys()
            }
            total_loss.update(zero_dn_loss)
            return total_loss

        dn_positive_idx = dn_meta["dn_positive_idx"]
        dn_num_group = dn_meta["dn_num_group"]
        assert len(gt_class) == len(dn_positive_idx)

        # denoising match indices
        dn_match_indices = DINOLoss.get_dn_match_indices(
            gt_class, dn_positive_idx, dn_num_group)

        # compute denoising training loss
        # (every gt appears once per denoising group)
        num_gts *= dn_num_group
        dn_loss = super(MaskDINOLoss, self).forward(
            dn_out_bboxes,
            dn_out_logits,
            gt_bbox,
            gt_class,
            masks=dn_out_masks,
            gt_mask=gt_mask,
            postfix="_dn",
            dn_match_indices=dn_match_indices,
            num_gts=num_gts)
        total_loss.update(dn_loss)

        return total_loss

    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
                       postfix=""):
        """BCE and dice losses on points sampled from the matched masks."""
        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
        name_mask = "loss_mask" + postfix
        name_dice = "loss_dice" + postfix

        loss = dict()
        if sum(len(a) for a in gt_mask) == 0:
            loss[name_mask] = paddle.to_tensor([0.])
            loss[name_dice] = paddle.to_tensor([0.])
            return loss

        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
                                                              match_indices)
        # sample points
        # (coordinates come back in [0, 1] and are mapped to the [-1, 1]
        # range used for grid_sample)
        point_coords = self._get_point_coords_by_uncertainty(src_masks)
        point_grid = 2.0 * point_coords.unsqueeze(1) - 1.0

        sampled_src = F.grid_sample(
            src_masks.unsqueeze(1), point_grid,
            align_corners=False).squeeze([1, 2])

        sampled_target = F.grid_sample(
            target_masks.unsqueeze(1), point_grid,
            align_corners=False).squeeze([1, 2]).detach()

        bce = F.binary_cross_entropy_with_logits(
            sampled_src, sampled_target, reduction='none')
        loss[name_mask] = self.loss_coeff['mask'] * bce.mean(1).sum() / num_gts
        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
            sampled_src, sampled_target, num_gts)
        return loss

    def _get_point_coords_by_uncertainty(self, masks):
        """Pick sample points per mask, favouring uncertain locations.

        Uncertainty of a point is ``-|logit|`` of the (detached) mask at
        that point. Returns coordinates in [0, 1) with shape
        ``[num_masks, num_important_points (+ num_random_points), 2]``.
        """
        # Sample points based on their uncertainty.
        masks = masks.detach()
        num_masks = masks.shape[0]
        candidates = paddle.rand(
            [num_masks, 1, self.num_oversample_points, 2])

        point_logits = F.grid_sample(
            masks.unsqueeze(1), 2.0 * candidates - 1.0,
            align_corners=False).squeeze([1, 2])
        uncertainty = -paddle.abs(point_logits)

        _, topk_ind = paddle.topk(
            uncertainty, self.num_important_points, axis=1)
        # Pair every selected point index with its mask index for gather_nd.
        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        sample_points = paddle.gather_nd(candidates.squeeze(1), topk_ind)
        if self.num_random_points > 0:
            random_points = paddle.rand(
                [num_masks, self.num_random_points, 2])
            sample_points = paddle.concat(
                [sample_points, random_points], axis=1)
        return sample_points
|
||||
138
rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
Normal file
138
rtdetr_paddle/ppdet/modeling/losses/focal_loss.py
Normal file
@@ -0,0 +1,138 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn.functional as F
|
||||
import paddle.nn as nn
|
||||
from ppdet.core.workspace import register
|
||||
|
||||
__all__ = ['FocalLoss', 'Weighted_FocalLoss']
|
||||
|
||||
@register
class FocalLoss(nn.Layer):
    """A wrapper around paddle.nn.functional.sigmoid_focal_loss.

    Args:
        use_sigmoid (bool): currently only support use_sigmoid=True
        alpha (float): parameter alpha in Focal Loss
        gamma (float): parameter gamma in Focal Loss
        loss_weight (float): final loss will be multiplied by this
    """

    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0):
        super(FocalLoss, self).__init__()
        assert use_sigmoid == True, \
            'Focal Loss only supports sigmoid at the moment'
        self.use_sigmoid = use_sigmoid
        self.alpha = alpha
        self.gamma = gamma
        self.loss_weight = loss_weight

    def forward(self, pred, target, reduction='none'):
        """forward function.

        Args:
            pred (Tensor): logits of class prediction, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            reduction (str): the way to reduce loss, one of (none, sum, mean)

        Returns:
            Tensor: the focal loss multiplied by ``loss_weight``.
        """
        num_classes = pred.shape[1]
        # One-hot over num_classes + 1 labels, then drop the last column so
        # the label `num_classes` maps to an all-zero row.
        one_hot = F.one_hot(target, num_classes+1).cast(pred.dtype)
        one_hot = one_hot[:, :-1].detach()
        focal = F.sigmoid_focal_loss(
            pred,
            one_hot,
            alpha=self.alpha,
            gamma=self.gamma,
            reduction=reduction)
        return focal * self.loss_weight
|
||||
|
||||
|
||||
@register
class Weighted_FocalLoss(FocalLoss):
    """Sigmoid focal loss with optional element weights and averaging factor.

    Args:
        use_sigmoid (bool): currently only support use_sigmoid=True
        alpha (float): parameter alpha in Focal Loss
        gamma (float): parameter gamma in Focal Loss
        loss_weight (float): final loss will be multiplied by this
        reduction (str): default way to reduce loss, one of (none, sum, mean)
    """

    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.25,
                 gamma=2.0,
                 loss_weight=1.0,
                 reduction="mean"):
        # Delegate the shared validation and attributes to FocalLoss rather
        # than skipping it in the MRO and duplicating its body.
        super(Weighted_FocalLoss, self).__init__(
            use_sigmoid=use_sigmoid,
            alpha=alpha,
            gamma=gamma,
            loss_weight=loss_weight)
        self.reduction = reduction

    def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None):
        """forward function.

        Args:
            pred (Tensor): logits of class prediction, of shape (N, num_classes)
            target (Tensor): target class label, of shape (N, )
            weight (Tensor, optional): element weights. Accepted with the
                shape of the loss, with a matching first dimension (reshaped
                to (N, 1)), or with the same number of elements as the loss
                (reshaped to (N, -1)).
            avg_factor (optional): when given with 'mean' reduction, the loss
                is summed and divided by `avg_factor + 1e-10` instead of
                being averaged.
            reduction_override (str, optional): one of (none, sum, mean);
                replaces the reduction chosen at construction.

        Returns:
            Tensor: the reduced loss multiplied by ``loss_weight``.

        Raises:
            ValueError: if `avg_factor` is given together with 'sum' reduction.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        num_classes = pred.shape[1]
        # One-hot over num_classes + 1 labels, then drop the last column so
        # the label `num_classes` maps to an all-zero row.
        target = F.one_hot(target, num_classes + 1).astype(pred.dtype)
        target = target[:, :-1].detach()
        loss = F.sigmoid_focal_loss(
            pred, target, alpha=self.alpha, gamma=self.gamma,
            reduction='none')

        if weight is not None:
            if weight.shape != loss.shape:
                if weight.shape[0] == loss.shape[0]:
                    # For most cases, weight is of shape (num_priors, ),
                    # which means it does not have the second axis num_class
                    weight = weight.reshape((-1, 1))
                else:
                    # Sometimes, weight per anchor per class is also needed. e.g.
                    # in FSAF. But it may be flattened of shape
                    # (num_priors x num_class, ), while loss is still of shape
                    # (num_priors, num_class).
                    assert weight.numel() == loss.numel()
                    weight = weight.reshape((loss.shape[0], -1))
            assert weight.ndim == loss.ndim
            loss = loss * weight

        # if avg_factor is not specified, just reduce the loss
        if avg_factor is None:
            if reduction == 'mean':
                loss = loss.mean()
            elif reduction == 'sum':
                loss = loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if reduction == 'mean':
                # Avoid causing ZeroDivisionError when avg_factor is 0.0,
                # i.e., all labels of an image belong to ignore index.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif reduction != 'none':
                raise ValueError('avg_factor can not be used with reduction="sum"')

        return loss * self.loss_weight
|
||||
217
rtdetr_paddle/ppdet/modeling/losses/gfocal_loss.py
Normal file
217
rtdetr_paddle/ppdet/modeling/losses/gfocal_loss.py
Normal file
@@ -0,0 +1,217 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# The code is based on:
|
||||
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling import ops
|
||||
|
||||
__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
|
||||
|
||||
|
||||
def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
    """
    Quality Focal Loss (QFL), from `Generalized Focal Loss: Learning
    Qualified and Distributed Bounding Boxes for Dense Object Detection
    <https://arxiv.org/abs/2006.04388>`_.

    Every (sample, class) entry is first treated as a negative supervised
    by a zero quality score; entries that belong to a foreground sample and
    its labelled class are then replaced by a term supervised by the
    quality score.

    Args:
        pred (Tensor): Predicted joint representation of classification
            and quality (IoU) estimation with shape (N, C), C is the number
            of classes.
        target (tuple([Tensor])): Target category label with shape (N,)
            and target quality label with shape (N,).
        beta (float): Exponent of the modulating factor. Defaults to 2.0.
        use_sigmoid (bool): If True, ``pred`` is treated as logits (sigmoid
            is applied and BCE-with-logits is used); otherwise ``pred`` is
            used directly as a probability. Defaults to True.
    Returns:
        Tensor: Loss summed over the class axis, shape (N,).
    """
    assert len(target) == 2, """target for QFL must be a tuple of two elements,
        including category label and quality label, respectively"""
    # label holds the category id, score holds the quality score
    label, score = target
    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
    num_classes = pred.shape[1]

    if use_sigmoid:
        bce = F.binary_cross_entropy_with_logits
        prob = F.sigmoid(pred)
    else:
        bce = F.binary_cross_entropy
        prob = pred

    # negatives are supervised by a 0 quality score
    neg_target = paddle.zeros(pred.shape, dtype='float32')
    neg_loss = bce(pred, neg_target, reduction='none') * prob.pow(beta)

    is_fg = paddle.logical_and((label >= 0), (label < num_classes))
    fg_idx = is_fg.nonzero().squeeze(1)
    if fg_idx.shape[0] == 0:
        # no foreground sample: only the negative term contributes
        return neg_loss.sum(axis=1)

    # Boolean (N, C) mask that is True at (foreground row, labelled class);
    # it is assembled on the host with numpy and then moved back to a tensor.
    fg_label = paddle.gather(label, fg_idx, axis=0)
    mask_np = np.zeros(pred.shape, dtype=np.int32)
    mask_np[fg_idx.numpy(), fg_label.numpy()] = 1
    fg_mask = paddle.to_tensor(mask_np, dtype='bool')

    # positives are supervised by the bbox quality (IoU) score
    quality = score.unsqueeze(-1).expand([-1, num_classes]).cast('float32')
    pos_loss = bce(
        pred, quality, reduction='none') * (quality - prob).abs().pow(beta)

    mixed = neg_loss * paddle.logical_not(fg_mask) + pos_loss * fg_mask
    return mixed.sum(axis=1)
|
||||
|
||||
|
||||
def distribution_focal_loss(pred, label):
    """Distribution Focal Loss (DFL), from `Generalized Focal Loss: Learning
    Qualified and Distributed Bounding Boxes for Dense Object Detection
    <https://arxiv.org/abs/2006.04388>`_.

    The continuous target is split between its two neighbouring integer
    bins; each bin's cross entropy is weighted by the target's distance to
    the opposite bin.

    Args:
        pred (Tensor): Predicted general distribution of bounding boxes
            (before softmax) with shape (N, n+1), n is the max value of the
            integral set `{0, ..., n}` in paper.
        label (Tensor): Target distance label for bounding boxes with
            shape (N,).
    Returns:
        Tensor: Loss tensor with shape (N,).
    """
    # integer bins directly below / above the continuous target
    left_bin = label.cast('int64')
    right_bin = left_bin + 1
    # linear interpolation weights between the two bins
    w_left = right_bin.cast('float32') - label
    w_right = label - left_bin.cast('float32')
    ce_left = F.cross_entropy(pred, left_bin, reduction='none')
    ce_right = F.cross_entropy(pred, right_bin, reduction='none')
    return ce_left * w_left + ce_right * w_right
|
||||
|
||||
|
||||
@register
@serializable
class QualityFocalLoss(nn.Layer):
    r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:
    Learning Qualified and Distributed Bounding Boxes for Dense Object
    Detection <https://arxiv.org/abs/2006.04388>`_.

    Args:
        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
            Defaults to True.
        beta (float): The beta parameter for calculating the modulating factor.
            Defaults to 2.0.
        reduction (str): Options are "none", "mean" and "sum".
        loss_weight (float): Loss weight of current loss.
    """

    def __init__(self,
                 use_sigmoid=True,
                 beta=2.0,
                 reduction='mean',
                 loss_weight=1.0):
        super(QualityFocalLoss, self).__init__()
        self.use_sigmoid = use_sigmoid
        self.beta = beta
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): Predicted joint representation of
                classification and quality (IoU) estimation with shape (N, C),
                C is the number of classes.
            target (tuple([Tensor])): Target category label with shape
                (N,) and target quality label with shape (N,).
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.

        Returns:
            Tensor: The per-sample loss when reduction is "none", otherwise
                the reduced loss.

        Raises:
            ValueError: If ``avg_factor`` is given together with
                reduction="sum".
        """
        loss = self.loss_weight * quality_focal_loss(
            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)

        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            elif self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if self.reduction == 'mean':
                # Avoid inf/NaN when avg_factor is 0, e.g. when a batch
                # contains no positive sample.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif self.reduction != 'none':
                raise ValueError(
                    'avg_factor can not be used with reduction="sum"')
        return loss
|
||||
|
||||
|
||||
@register
@serializable
class DistributionFocalLoss(nn.Layer):
    """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:
    Learning Qualified and Distributed Bounding Boxes for Dense Object
    Detection <https://arxiv.org/abs/2006.04388>`_.

    Args:
        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
        loss_weight (float): Loss weight of current loss.
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(DistributionFocalLoss, self).__init__()
        assert reduction in ('none', 'mean', 'sum')
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): Predicted general distribution of bounding
                boxes (before softmax) with shape (N, n+1), n is the max value
                of the integral set `{0, ..., n}` in paper.
            target (Tensor): Target distance label for bounding boxes
                with shape (N,).
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.

        Returns:
            Tensor: The per-sample loss when reduction is "none", otherwise
                the reduced loss.

        Raises:
            ValueError: If ``avg_factor`` is given together with
                reduction="sum".
        """
        loss = self.loss_weight * distribution_focal_loss(pred, target)

        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            elif self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if self.reduction == 'mean':
                # Avoid inf/NaN when avg_factor is 0, e.g. when a batch
                # contains no positive sample.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif self.reduction != 'none':
                raise ValueError(
                    'avg_factor can not be used with reduction="sum"')
        return loss
|
||||
295
rtdetr_paddle/ppdet/modeling/losses/iou_loss.py
Normal file
295
rtdetr_paddle/ppdet/modeling/losses/iou_loss.py
Normal file
@@ -0,0 +1,295 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import math
|
||||
import paddle
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..bbox_utils import bbox_iou
|
||||
|
||||
__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss']
|
||||
|
||||
|
||||
@register
@serializable
class IouLoss(object):
    """
    iou loss, see https://arxiv.org/abs/1908.03851
    loss = 1.0 - iou * iou (or 1.0 - iou when ``loss_square`` is False)

    Args:
        loss_weight (float): iou loss weight, default is 2.5
        giou (bool): forwarded to ``bbox_iou`` as its ``giou`` flag
        diou (bool): forwarded to ``bbox_iou`` as its ``diou`` flag
        ciou (bool): forwarded to ``bbox_iou`` as its ``ciou`` flag
        loss_square (bool): whether to square the iou term
    """

    def __init__(self,
                 loss_weight=2.5,
                 giou=False,
                 diou=False,
                 ciou=False,
                 loss_square=True):
        self.loss_weight = loss_weight
        self.giou = giou
        self.diou = diou
        self.ciou = ciou
        self.loss_square = loss_square

    def __call__(self, pbox, gbox):
        """Return the weighted IoU loss between ``pbox`` and ``gbox``.

        Both boxes are passed unchanged to ``bbox_iou``; no reduction is
        applied to the result.
        """
        iou = bbox_iou(
            pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou)
        overlap_term = iou * iou if self.loss_square else iou
        return (1 - overlap_term) * self.loss_weight
|
||||
|
||||
|
||||
@register
@serializable
class GIoULoss(object):
    """
    Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630

    Args:
        loss_weight (float): giou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        reduction (string): Options are "none", "mean" and "sum". default as none
    """

    def __init__(self, loss_weight=1., eps=1e-10, reduction='none'):
        assert reduction in ('none', 'mean', 'sum')
        self.loss_weight = loss_weight
        self.eps = eps
        self.reduction = reduction

    def bbox_overlap(self, box1, box2, eps=1e-10):
        """calculate the iou of box1 and box2

        Args:
            box1 (sequence of Tensor): the four coordinates
                (x1, y1, x2, y2) of the first boxes
            box2 (sequence of Tensor): the four coordinates
                (x1, y1, x2, y2) of the second boxes
            eps (float): epsilon to avoid divide by zero
        Return:
            iou (Tensor): iou of box1 and box2
            overlap (Tensor): overlap of box1 and box2
            union (Tensor): union of box1 and box2 (includes ``eps``)
        """
        x1, y1, x2, y2 = box1
        x1g, y1g, x2g, y2g = box2

        # intersection rectangle; negative extents are clipped to zero
        inter_w = (paddle.minimum(x2, x2g) - paddle.maximum(x1, x1g)).clip(0)
        inter_h = (paddle.minimum(y2, y2g) - paddle.maximum(y1, y1g)).clip(0)
        overlap = inter_w * inter_h

        area_pred = (x2 - x1) * (y2 - y1)
        area_gt = (x2g - x1g) * (y2g - y1g)
        union = area_pred + area_gt - overlap + eps

        return overlap / union, overlap, union

    def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None):
        """Compute the GIoU loss between ``pbox`` and ``gbox``.

        Args:
            pbox (Tensor): predicted boxes, split into 4 parts along the
                last axis as (x1, y1, x2, y2)
            gbox (Tensor): target boxes in the same layout as ``pbox``
            iou_weight (float|Tensor): weight multiplied into the loss
                before the "sum" / "mean" reduction; it is not applied
                when reduction is "none"
            loc_reweight (Tensor, optional): if given, it is reshaped to
                (-1, 1) and blended into the loss with a fixed factor 0.9
        Returns:
            Tensor: the loss scaled by ``loss_weight``
        """
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
        iou, overlap, union = self.bbox_overlap(
            [x1, y1, x2, y2], [x1g, y1g, x2g, y2g], self.eps)

        # smallest box enclosing both the prediction and the target
        enclose_w = paddle.maximum(x2, x2g) - paddle.minimum(x1, x1g)
        enclose_h = paddle.maximum(y2, y2g) - paddle.minimum(y1, y1g)
        area_c = enclose_w * enclose_h + self.eps
        miou = iou - ((area_c - union) / area_c)

        if loc_reweight is None:
            giou = 1 - miou
        else:
            loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1))
            loc_thresh = 0.9
            giou = 1 - (1 - loc_thresh
                        ) * miou - loc_thresh * miou * loc_reweight

        if self.reduction == 'none':
            loss = giou
        elif self.reduction == 'sum':
            loss = paddle.sum(giou * iou_weight)
        else:
            loss = paddle.mean(giou * iou_weight)
        return loss * self.loss_weight
|
||||
|
||||
|
||||
@register
@serializable
class DIouLoss(GIoULoss):
    """
    Distance-IoU Loss, see https://arxiv.org/abs/1911.08287

    Args:
        loss_weight (float): diou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        use_complete_iou_loss (bool): whether to use complete iou loss
    """

    def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True):
        super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps)
        self.use_complete_iou_loss = use_complete_iou_loss

    def __call__(self, pbox, gbox, iou_weight=1.):
        """Compute the DIoU (optionally CIoU) loss.

        Args:
            pbox (Tensor): predicted boxes, split into 4 parts along the
                last axis as (x1, y1, x2, y2)
            gbox (Tensor): target boxes in the same layout as ``pbox``
            iou_weight (float|Tensor): weight multiplied into the loss
                before averaging
        Returns:
            Tensor: mean of the weighted loss scaled by ``loss_weight``.
                The result is always mean-reduced; ``self.reduction`` is
                not consulted here.
        """
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)
        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        w = x2 - x1
        h = y2 - y1

        cxg = (x1g + x2g) / 2
        cyg = (y1g + y2g) / 2
        wg = x2g - x1g
        hg = y2g - y1g

        # force x2 >= x1 and y2 >= y1 for the predicted box
        x2 = paddle.maximum(x1, x2)
        y2 = paddle.maximum(y1, y2)

        # A and B
        xkis1 = paddle.maximum(x1, x1g)
        ykis1 = paddle.maximum(y1, y1g)
        xkis2 = paddle.minimum(x2, x2g)
        ykis2 = paddle.minimum(y2, y2g)

        # A or B
        xc1 = paddle.minimum(x1, x1g)
        yc1 = paddle.minimum(y1, y1g)
        xc2 = paddle.maximum(x2, x2g)
        yc2 = paddle.maximum(y2, y2g)

        intsctk = (xkis2 - xkis1) * (ykis2 - ykis1)
        # zero the intersection when the boxes do not overlap
        intsctk = intsctk * paddle.greater_than(
            xkis2, xkis1) * paddle.greater_than(ykis2, ykis1)
        unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g
                                                        ) - intsctk + self.eps
        iouk = intsctk / unionk

        # DIOU term
        dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg)
        dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1)
        diou_term = (dist_intersection + self.eps) / (dist_union + self.eps)

        # CIOU term
        ciou_term = 0
        if self.use_complete_iou_loss:
            # eps keeps the aspect ratios finite for zero-height boxes
            # (e.g. all-zero padded targets); 0/0 would otherwise give a
            # NaN that survives even a zero iou_weight.
            ar_gt = wg / (hg + self.eps)
            ar_pred = w / (h + self.eps)
            arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred)
            ar_loss = 4. / np.pi / np.pi * arctan * arctan
            alpha = ar_loss / (1 - iouk + ar_loss + self.eps)
            # alpha is treated as a constant trade-off factor
            alpha.stop_gradient = True
            ciou_term = alpha * ar_loss

        diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight)

        return diou * self.loss_weight
|
||||
|
||||
|
||||
@register
@serializable
class SIoULoss(GIoULoss):
    """
    see https://arxiv.org/pdf/2205.12740.pdf

    Args:
        loss_weight (float): siou loss weight, default as 1
        eps (float): epsilon to avoid divide by zero, default as 1e-10
        theta (float): default as 4
        reduction (str): Options are "none", "mean" and "sum". default as none
    """

    def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'):
        # The parent stores loss_weight / eps and validates reduction.
        super(SIoULoss, self).__init__(
            loss_weight=loss_weight, eps=eps, reduction=reduction)
        self.theta = theta

    def __call__(self, pbox, gbox):
        """Compute the SIoU loss between ``pbox`` and ``gbox``.

        Args:
            pbox (Tensor): predicted boxes, split into 4 parts along the
                last axis as (x1, y1, x2, y2)
            gbox (Tensor): target boxes in the same layout as ``pbox``
        Returns:
            Tensor: the loss, reduced according to ``self.reduction`` and
                scaled by ``loss_weight``
        """
        x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1)
        x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1)

        box1 = [x1, y1, x2, y2]
        box2 = [x1g, y1g, x2g, y2g]
        iou = bbox_iou(box1, box2)

        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        w = x2 - x1 + self.eps
        h = y2 - y1 + self.eps

        cxg = (x1g + x2g) / 2
        cyg = (y1g + y2g) / 2
        wg = x2g - x1g + self.eps
        hg = y2g - y1g + self.eps

        # force x2 >= x1 and y2 >= y1 for the predicted box
        x2 = paddle.maximum(x1, x2)
        y2 = paddle.maximum(y1, y2)

        # A or B
        xc1 = paddle.minimum(x1, x1g)
        yc1 = paddle.minimum(y1, y1g)
        xc2 = paddle.maximum(x2, x2g)
        yc2 = paddle.maximum(y2, y2g)

        # eps keeps the distance-cost ratios finite when both boxes are
        # degenerate along an axis
        cw_out = xc2 - xc1 + self.eps
        ch_out = yc2 - yc1 + self.eps

        ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg)
        cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg)

        # angle cost
        # eps sits inside the sqrt so that coinciding centres give 0 / small
        # instead of 0 / 0 (NaN), and the sqrt gradient stays finite.
        dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2 +
                                        self.eps)
        sin_angle_alpha = ch / dist_intersection
        sin_angle_beta = cw / dist_intersection
        # sin(pi / 4); a plain constant, so no gradient flows through it
        thred = math.sqrt(2) / 2
        sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta,
                                 sin_angle_alpha)
        angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2)

        # distance cost
        gamma = 2 - angle_cost
        beta_x = ((cxg - cx) / cw_out)**2
        beta_y = ((cyg - cy) / ch_out)**2
        dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma *
                                                                     beta_y)

        # shape cost
        omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)
        omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)
        omega = (1 - paddle.exp(-omega_w))**self.theta + (
            1 - paddle.exp(-omega_h))**self.theta
        siou_loss = 1 - iou + (omega + dist_cost) / 2

        if self.reduction == 'mean':
            siou_loss = paddle.mean(siou_loss)
        elif self.reduction == 'sum':
            siou_loss = paddle.sum(siou_loss)

        return siou_loss * self.loss_weight
|
||||
60
rtdetr_paddle/ppdet/modeling/losses/smooth_l1_loss.py
Normal file
60
rtdetr_paddle/ppdet/modeling/losses/smooth_l1_loss.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register
|
||||
|
||||
__all__ = ['SmoothL1Loss']
|
||||
|
||||
@register
class SmoothL1Loss(nn.Layer):
    """Smooth L1 Loss.

    Args:
        beta (float): controls smooth region, it becomes L1 Loss when beta=0.0
        loss_weight (float): the final loss will be multiplied by this
    """

    def __init__(self, beta=1.0, loss_weight=1.0):
        super(SmoothL1Loss, self).__init__()
        assert beta >= 0
        self.beta = beta
        self.loss_weight = loss_weight

    def forward(self, pred, target, reduction='none'):
        """forward function, based on fvcore.

        Args:
            pred (Tensor): prediction tensor
            target (Tensor): target tensor, pred.shape must be the same as target.shape
            reduction (str): the way to reduce loss, one of (none, sum, mean)

        Returns:
            Tensor: the element-wise or reduced loss, multiplied by
                ``loss_weight``
        """
        assert reduction in ('none', 'sum', 'mean')
        # the target is detached, so no gradient flows into it
        target = target.detach()
        abs_err = paddle.abs(pred - target)
        if self.beta < 1e-5:
            # beta is effectively zero: plain L1
            loss = abs_err
        else:
            quadratic = 0.5 * abs_err ** 2 / self.beta
            linear = abs_err - 0.5 * self.beta
            loss = paddle.where(abs_err < self.beta, quadratic, linear)

        if reduction == 'sum':
            loss = loss.sum()
        elif reduction == 'mean':
            # an empty tensor falls back to a zero-valued sum
            loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()
        return loss * self.loss_weight
|
||||
152
rtdetr_paddle/ppdet/modeling/losses/varifocal_loss.py
Normal file
152
rtdetr_paddle/ppdet/modeling/losses/varifocal_loss.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# The code is based on:
|
||||
# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling import ops
|
||||
|
||||
__all__ = ['VarifocalLoss']
|
||||
|
||||
|
||||
def varifocal_loss(pred,
                   target,
                   alpha=0.75,
                   gamma=2.0,
                   iou_weighted=True,
                   use_sigmoid=True):
    """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_

    Args:
        pred (Tensor): The prediction with shape (N, C), C is the
            number of classes
        target (Tensor): The learning target of the iou-aware
            classification score with shape (N, C), C is the number of classes.
        alpha (float, optional): A balance factor for the negative part of
            Varifocal Loss, which is different from the alpha of Focal Loss.
            Defaults to 0.75.
        gamma (float, optional): The gamma for calculating the modulating
            factor. Defaults to 2.0.
        iou_weighted (bool, optional): Whether to weight the loss of the
            positive example with the iou target. Defaults to True.
        use_sigmoid (bool, optional): If True, ``pred`` is treated as logits
            (sigmoid is applied and BCE-with-logits is used); otherwise
            ``pred`` is used directly as a probability. Defaults to True.

    Returns:
        Tensor: Loss summed over the class axis, shape (N,).
    """
    # pred and target should be of the same size
    assert pred.shape == target.shape
    prob = F.sigmoid(pred) if use_sigmoid else pred
    target = target.cast(pred.dtype)

    # entries with target > 0 are positives, all others are negatives
    pos = (target > 0.0).cast('float32')
    neg = (target <= 0.0).cast('float32')
    pos_weight = target * pos if iou_weighted else pos
    focal_weight = pos_weight + \
        alpha * (prob - target).abs().pow(gamma) * \
        neg

    if use_sigmoid:
        bce = F.binary_cross_entropy_with_logits
    else:
        bce = F.binary_cross_entropy
    weighted = bce(pred, target, reduction='none') * focal_weight
    return weighted.sum(axis=1)
|
||||
|
||||
|
||||
@register
@serializable
class VarifocalLoss(nn.Layer):
    def __init__(self,
                 use_sigmoid=True,
                 alpha=0.75,
                 gamma=2.0,
                 iou_weighted=True,
                 reduction='mean',
                 loss_weight=1.0):
        """`Varifocal Loss <https://arxiv.org/abs/2008.13367>`_

        Args:
            use_sigmoid (bool, optional): Whether the prediction is
                used for sigmoid or softmax. Defaults to True.
            alpha (float, optional): A balance factor for the negative part of
                Varifocal Loss, which is different from the alpha of Focal
                Loss. Defaults to 0.75.
            gamma (float, optional): The gamma for calculating the modulating
                factor. Defaults to 2.0.
            iou_weighted (bool, optional): Whether to weight the loss of the
                positive examples with the iou target. Defaults to True.
            reduction (str, optional): The method used to reduce the loss into
                a scalar. Defaults to 'mean'. Options are "none", "mean" and
                "sum".
            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
        """
        super(VarifocalLoss, self).__init__()
        assert alpha >= 0.0
        # Validate up front: an unknown reduction would otherwise make
        # forward() fall through and return None.
        assert reduction in ('none', 'mean', 'sum')
        self.use_sigmoid = use_sigmoid
        self.alpha = alpha
        self.gamma = gamma
        self.iou_weighted = iou_weighted
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self, pred, target, weight=None, avg_factor=None):
        """Forward function.

        Args:
            pred (Tensor): The prediction.
            target (Tensor): The learning target of the prediction.
            weight (Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
        Returns:
            Tensor: The calculated loss

        Raises:
            ValueError: If ``avg_factor`` is given together with
                reduction="sum".
        """
        loss = self.loss_weight * varifocal_loss(
            pred,
            target,
            alpha=self.alpha,
            gamma=self.gamma,
            iou_weighted=self.iou_weighted,
            use_sigmoid=self.use_sigmoid)

        if weight is not None:
            loss = loss * weight

        if avg_factor is None:
            if self.reduction == 'none':
                return loss
            elif self.reduction == 'mean':
                return loss.mean()
            elif self.reduction == 'sum':
                return loss.sum()
        else:
            # if reduction is mean, then average the loss by avg_factor
            if self.reduction == 'mean':
                # Avoid inf/NaN when avg_factor is 0, e.g. when a batch
                # contains no positive sample.
                eps = 1e-10
                loss = loss.sum() / (avg_factor + eps)
            # if reduction is 'none', then do nothing, otherwise raise an error
            elif self.reduction != 'none':
                raise ValueError(
                    'avg_factor can not be used with reduction="sum"')
        return loss
|
||||
1114
rtdetr_paddle/ppdet/modeling/ops.py
Normal file
1114
rtdetr_paddle/ppdet/modeling/ops.py
Normal file
File diff suppressed because it is too large
Load Diff
244
rtdetr_paddle/ppdet/modeling/post_process.py
Normal file
244
rtdetr_paddle/ppdet/modeling/post_process.py
Normal file
@@ -0,0 +1,244 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import numpy as np
|
||||
import paddle
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register
|
||||
from .transformers import bbox_cxcywh_to_xyxy
|
||||
|
||||
__all__ = [
|
||||
'DETRPostProcess',
|
||||
]
|
||||
|
||||
@register
class DETRPostProcess(object):
    """Post-process DETR-style head outputs into final detections.

    Boxes are converted with `bbox_cxcywh_to_xyxy` and scaled to the image
    size, the best-scoring predictions are selected and, when `with_mask`
    is enabled, the matching masks are resized and binarized.
    """
    __shared__ = ['num_classes', 'use_focal_loss', 'with_mask']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 dual_queries=False,
                 dual_groups=0,
                 use_focal_loss=False,
                 with_mask=False,
                 mask_threshold=0.5,
                 use_avg_mask_score=False,
                 bbox_decode_type='origin'):
        """
        Args:
            num_classes (int): Number of classes; used to split a flattened
                (query, class) index back into a query index and a label.
            num_top_queries (int): Maximum number of detections per image.
            dual_queries (bool): If True only the first
                `num_queries // (dual_groups + 1)` queries are decoded.
            dual_groups (int): Number of extra query groups, see above.
            use_focal_loss (bool): Use sigmoid scores over all
                (query, class) pairs instead of softmax scores per query.
            with_mask (bool): Whether masks are decoded as well.
            mask_threshold (float): Threshold applied to the mask sigmoid.
            use_avg_mask_score (bool): Multiply box scores by the mean mask
                probability inside the binarized mask.
            bbox_decode_type (str): 'origin' scales boxes to the original
                image shape, 'pad' to the padded shape.
        """
        super(DETRPostProcess, self).__init__()
        assert bbox_decode_type in ['origin', 'pad']

        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.dual_queries = dual_queries
        self.dual_groups = dual_groups
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.mask_threshold = mask_threshold
        self.use_avg_mask_score = use_avg_mask_score
        self.bbox_decode_type = bbox_decode_type

    def _mask_postprocess(self, mask_pred, score_pred, index):
        """Gather the selected masks, binarize them and optionally rescore.

        Returns the binarized masks of the first image (as int32) and the
        scores. NOTE: `score_pred` is updated in place when
        `use_avg_mask_score` is set.
        """
        mask_score = F.sigmoid(paddle.gather_nd(mask_pred, index))
        mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype)
        if self.use_avg_mask_score:
            # Mean mask probability over the foreground pixels; the epsilon
            # guards against empty masks.
            avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / (
                mask_pred.sum([-2, -1]) + 1e-6)
            score_pred *= avg_mask_score

        return mask_pred[0].astype('int32'), score_pred

    def __call__(self, head_out, im_shape, scale_factor, pad_shape):
        """
        Decode the bbox and mask.

        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image without padding.
            scale_factor (Tensor): The scale factor of the input image.
            pad_shape (Tensor): The shape of the input image with padding.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6]; each
                row holds label, score and the four box coordinates scaled
                to the image selected by `bbox_decode_type`.
            bbox_num (Tensor): The number of prediction boxes of each image
                with shape [bs].
            mask_pred (Tensor|None): Binarized masks of the first image when
                `with_mask` is enabled, otherwise None.
        """
        bboxes, logits, masks = head_out
        if self.dual_queries:
            num_queries = logits.shape[1]
            group_size = int(num_queries // (self.dual_groups + 1))
            logits, bboxes = logits[:, :group_size, :], \
                bboxes[:, :group_size, :]

        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        # calculate the original shape of the image
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = paddle.split(origin_shape, 2, axis=-1)
        if self.bbox_decode_type == 'pad':
            # calculate the shape of the image with padding
            out_shape = pad_shape / im_shape * origin_shape
            out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1)
        elif self.bbox_decode_type == 'origin':
            out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1)
        else:
            raise Exception(
                f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.')
        bbox_pred *= out_shape

        scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax(
            logits)[:, :, :-1]

        # Number of detections actually kept per image (reported through
        # `bbox_num`) and the gather index of the kept queries.
        num_kept = self.num_top_queries
        index = None
        if not self.use_focal_loss:
            scores, labels = scores.max(-1), scores.argmax(-1)
            if scores.shape[1] > self.num_top_queries:
                scores, index = paddle.topk(
                    scores, self.num_top_queries, axis=-1)
                batch_ind = paddle.arange(
                    end=scores.shape[0]).unsqueeze(-1).tile(
                        [1, self.num_top_queries])
                index = paddle.stack([batch_ind, index], axis=-1)
                labels = paddle.gather_nd(labels, index)
                bbox_pred = paddle.gather_nd(bbox_pred, index)
            else:
                # Fewer queries than `num_top_queries`: every query is kept,
                # so the per-image count is the number of queries.
                num_kept = scores.shape[1]
        else:
            # Top-k over the flattened (query, class) scores.
            scores, index = paddle.topk(
                scores.flatten(1), self.num_top_queries, axis=-1)
            labels = index % self.num_classes
            index = index // self.num_classes
            batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(
                [1, self.num_top_queries])
            index = paddle.stack([batch_ind, index], axis=-1)
            bbox_pred = paddle.gather_nd(bbox_pred, index)

        mask_pred = None
        if self.with_mask:
            assert masks is not None
            if index is None:
                # No top-k was applied: select all kept queries in order.
                batch_size = scores.shape[0]
                batch_ind = paddle.arange(end=batch_size).unsqueeze(-1).tile(
                    [1, num_kept])
                query_ind = paddle.arange(end=num_kept).unsqueeze(0).tile(
                    [batch_size, 1])
                index = paddle.stack([batch_ind, query_ind], axis=-1)
            masks = F.interpolate(
                masks, scale_factor=4, mode="bilinear", align_corners=False)
            # TODO: Support prediction with bs>1.
            # remove padding for input image
            h, w = im_shape.astype('int32')[0]
            masks = masks[..., :h, :w]
            # get pred_mask in the original resolution.
            img_h = img_h[0].astype('int32')
            img_w = img_w[0].astype('int32')
            masks = F.interpolate(
                masks,
                size=(img_h, img_w),
                mode="bilinear",
                align_corners=False)
            mask_pred, scores = self._mask_postprocess(masks, scores, index)

        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        bbox_num = paddle.to_tensor(
            num_kept, dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num, mask_pred
|
||||
|
||||
|
||||
|
||||
def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False):
    """
    Paste the mask prediction to the original image.

    A sampling grid covering the whole `im_h` x `im_w` image is built per
    box and `masks` is resampled on it with `F.grid_sample`. The first
    channel of the resampled result is returned.

    NOTE: when `assign_on_cpu` is True the global paddle device is switched
    to CPU via `paddle.set_device` and is not restored afterwards.
    """
    left, top, right, bottom = paddle.split(boxes, 4, axis=1)
    num_masks = masks.shape[0]

    # Pixel centres of every image row / column.
    rows = paddle.arange(0, im_h) + 0.5
    cols = paddle.arange(0, im_w) + 0.5

    # Express the pixel centres relative to each box and rescale so that the
    # box edges land on -1 and 1 (the coordinate range grid_sample expects).
    rows = (rows - top) / (bottom - top) * 2 - 1
    cols = (cols - left) / (right - left) * 2 - 1
    # cols, rows have shapes (N, w), (N, h)

    if assign_on_cpu:
        paddle.set_device('cpu')

    grid_h = paddle.shape(rows)[1]
    grid_w = paddle.shape(cols)[1]
    grid_x = cols[:, None, :].expand([num_masks, grid_h, grid_w])
    grid_y = rows[:, :, None].expand([num_masks, grid_h, grid_w])
    sampling_grid = paddle.stack([grid_x, grid_y], axis=3)
    pasted = F.grid_sample(masks, sampling_grid, align_corners=False)
    return pasted[:, 0]
|
||||
|
||||
|
||||
def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
    """Run `nms` independently for every class id in `range(num_classes)`.

    Args:
        bboxs: array whose first column is the class id; the remaining
            columns are forwarded to `nms`.
        num_classes: number of class ids to scan.
        match_threshold: overlap threshold forwarded to `nms`.
        match_metric: overlap metric forwarded to `nms`.

    Returns:
        list: one array per class that has at least one box, each being the
        surviving rows with the class id prepended as the first column.
    """
    kept_per_class = []
    for class_id in range(num_classes):
        class_mask = bboxs[:, 0] == class_id
        if not np.any(class_mask):
            continue
        survivors = nms(bboxs[class_mask, 1:], match_threshold, match_metric)
        class_column = np.full((survivors.shape[0], 1), class_id)
        kept_per_class.append(
            np.concatenate([class_column, survivors], 1))
    return kept_per_class
|
||||
|
||||
|
||||
def nms(dets, match_threshold=0.6, match_metric='iou'):
    """ Apply NMS to avoid detecting too many overlapping bounding boxes.

    Boxes are visited in descending score order; every lower-scored box
    whose overlap with the current box reaches `match_threshold` is
    suppressed.

    Args:
        dets: shape [N, 5], [score, x1, y1, x2, y2]
        match_metric: 'iou' or 'ios'
        match_threshold: overlap thresh for match metric.

    Returns:
        The surviving rows of `dets`, in their original row order.

    Raises:
        ValueError: if `match_metric` is neither 'iou' nor 'ios'.
    """
    # Validate once up front so that a bad metric is reported regardless of
    # how many boxes are passed in.
    if match_metric not in ('iou', 'ios'):
        raise ValueError(
            "match_metric must be 'iou' or 'ios', but got {!r}.".format(
                match_metric))
    if dets.shape[0] == 0:
        return dets[[], :]

    scores = dets[:, 0]
    x1 = dets[:, 1]
    y1 = dets[:, 2]
    x2 = dets[:, 3]
    y2 = dets[:, 4]
    # Inclusive pixel coordinates, hence the +1.
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    suppressed = np.zeros(dets.shape[0], dtype=bool)
    for rank, i in enumerate(order):
        if suppressed[i]:
            continue
        # All boxes ranked below the current one, compared in one shot.
        rest = order[rank + 1:]
        if rest.size == 0:
            break
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0,
                                                             yy2 - yy1 + 1)
        if match_metric == 'iou':
            denominator = areas[i] + areas[rest] - inter
        else:  # 'ios': intersection over the smaller box
            denominator = np.minimum(areas[i], areas[rest])
        # Re-marking an already suppressed box is harmless.
        suppressed[rest[inter / denominator >= match_threshold]] = True

    keep = np.where(~suppressed)[0]
    return dets[keep, :]
|
||||
25
rtdetr_paddle/ppdet/modeling/shape_spec.py
Normal file
25
rtdetr_paddle/ppdet/modeling/shape_spec.py
Normal file
@@ -0,0 +1,25 @@
|
||||
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# The code is based on:
|
||||
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
|
||||
class ShapeSpec(
        namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])):
    """Named tuple describing a tensor's channels, height, width and stride.

    Every field is optional and defaults to None.
    """

    def __new__(cls, channels=None, height=None, width=None, stride=None):
        return super().__new__(
            cls, channels=channels, height=height, width=width, stride=stride)
|
||||
20
rtdetr_paddle/ppdet/modeling/transformers/__init__.py
Normal file
20
rtdetr_paddle/ppdet/modeling/transformers/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .utils import *
|
||||
from .matchers import *
|
||||
from .position_encoding import *
|
||||
from .rtdetr_transformer import *
|
||||
from .dino_transformer import *
|
||||
from .hybrid_encoder import *
|
||||
@@ -0,0 +1,537 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
|
||||
# Copyright (c) 2020 SenseTime. All Rights Reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .utils import _get_clones, get_valid_ratio
|
||||
from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
|
||||
|
||||
__all__ = ['DeformableTransformer']
|
||||
|
||||
|
||||
class MSDeformableAttention(nn.Layer):
    """Multi-scale deformable attention layer.

    For each query, sampling offsets and attention weights are predicted by
    linear layers, then `value` is sampled and aggregated by the core
    function (custom op if importable, paddle fallback otherwise).
    """

    def __init__(self,
                 embed_dim=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 lr_mult=0.1):
        """
        Multi-Scale Deformable Attention Module

        Args:
            embed_dim (int): Feature dimension; must be divisible by
                `num_heads`.
            num_heads (int): Number of attention heads.
            num_levels (int): Number of feature levels.
            num_points (int): Sampling points per head and level.
            lr_mult (float): Learning-rate multiplier for the
                `sampling_offsets` weight and bias.
        """
        super(MSDeformableAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        self.total_points = num_heads * num_levels * num_points

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        self.sampling_offsets = nn.Linear(
            embed_dim,
            self.total_points * 2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.attention_weights = nn.Linear(embed_dim, self.total_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)
        try:
            # use cuda op
            from deformable_detr_ops import ms_deformable_attn
        except Exception:
            # Fall back to the paddle implementation when the custom op is
            # missing or fails to load. `Exception` (rather than a bare
            # except) lets KeyboardInterrupt / SystemExit propagate.
            from .utils import deformable_attention_core_func as ms_deformable_attn
        self.ms_deformable_attn_core = ms_deformable_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the projections and the sampling-offset bias."""
        # sampling_offsets
        constant_(self.sampling_offsets.weight)
        # One direction per head, evenly spread over the circle and
        # normalized so the larger component has magnitude 1.
        thetas = paddle.arange(
            self.num_heads,
            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)
        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
            [1, self.num_levels, self.num_points, 1])
        # The k-th sampling point starts k steps away along its direction.
        scaling = paddle.arange(
            1, self.num_points + 1,
            dtype=paddle.float32).reshape([1, 1, -1, 1])
        grid_init *= scaling
        self.sampling_offsets.bias.set_value(grid_init.flatten())
        # attention_weights
        constant_(self.attention_weights.weight)
        constant_(self.attention_weights.bias)
        # proj
        xavier_uniform_(self.value_proj.weight)
        constant_(self.value_proj.bias)
        xavier_uniform_(self.output_proj.weight)
        constant_(self.output_proj.bias)

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: if the last dim of `reference_points` is not 2 or 4.
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        assert int(value_spatial_shapes.prod(1).sum()) == Len_v

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
        # Softmax jointly over all levels and points of each head.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])

        if reference_points.shape[-1] == 2:
            # Point references: offsets are normalized by each level's
            # (W, H).
            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2])
            sampling_locations = reference_points.reshape([
                bs, Len_q, 1, self.num_levels, 1, 2
            ]) + sampling_offsets / offset_normalizer
        elif reference_points.shape[-1] == 4:
            # Box references: offsets are scaled by the last two components
            # of the reference (presumably box width/height).
            sampling_locations = (
                reference_points[:, :, None, :, None, :2] + sampling_offsets /
                self.num_points * reference_points[:, :, None, :, None, 2:] *
                0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        output = self.ms_deformable_attn_core(
            value, value_spatial_shapes, value_level_start_index,
            sampling_locations, attention_weights)
        output = self.output_proj(output)

        return output
|
||||
|
||||
|
||||
class DeformableTransformerEncoderLayer(nn.Layer):
    """Encoder layer: deformable self-attention followed by a feed-forward
    network, each with a residual connection and a post LayerNorm."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()
        # --- self attention ---
        self.self_attn = MSDeformableAttention(
            embed_dim=d_model,
            num_heads=n_head,
            num_levels=n_levels,
            num_points=n_points,
            lr_mult=lr_mult)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        # --- feed-forward network ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize both feed-forward linears."""
        ffn_linears = (self.linear1, self.linear2)
        for linear in ffn_linears:
            linear_init_(linear)
        # The weights are then overwritten with Xavier-uniform values.
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Add `pos` to `tensor` unless `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Feed-forward sub-block with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(src))
        hidden = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(hidden))

    def forward(self,
                src,
                reference_points,
                spatial_shapes,
                level_start_index,
                src_mask=None,
                query_pos_embed=None):
        # Self attention: the query carries the positional embedding, the
        # value is the raw feature.
        query = self.with_pos_embed(src, query_pos_embed)
        attended = self.self_attn(query, reference_points, src,
                                  spatial_shapes, level_start_index, src_mask)
        src = self.norm1(src + self.dropout1(attended))
        # Feed-forward network.
        return self.forward_ffn(src)
|
||||
|
||||
|
||||
class DeformableTransformerEncoder(nn.Layer):
    """Stack of `num_layers` clones of `encoder_layer` applied in sequence."""

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
        """Build reference points from the cell centres of every level.

        For each level a (H, W) grid of cell centres is created, divided by
        the valid extent of that level, concatenated over levels and finally
        multiplied by `valid_ratios` again.
        """
        valid_ratios = valid_ratios.unsqueeze(1)
        per_level_points = []
        for level, (height, width) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=height) + offset,
                paddle.arange(end=width) + offset)
            valid_h = valid_ratios[:, :, level, 1] * height
            norm_y = grid_y.flatten().unsqueeze(0) / valid_h
            valid_w = valid_ratios[:, :, level, 0] * width
            norm_x = grid_x.flatten().unsqueeze(0) / valid_w
            per_level_points.append(paddle.stack((norm_x, norm_y), axis=-1))
        all_points = paddle.concat(per_level_points, 1).unsqueeze(2)
        return all_points * valid_ratios

    def forward(self,
                feat,
                spatial_shapes,
                level_start_index,
                feat_mask=None,
                query_pos_embed=None,
                valid_ratios=None):
        if valid_ratios is None:
            # Without ratios every level is treated as fully valid.
            batch_size = feat.shape[0]
            num_levels = spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch_size, num_levels, 2])
        reference_points = self.get_reference_points(spatial_shapes,
                                                     valid_ratios)
        encoded = feat
        for layer in self.layers:
            encoded = layer(encoded, reference_points, spatial_shapes,
                            level_start_index, feat_mask, query_pos_embed)

        return encoded
|
||||
|
||||
|
||||
class DeformableTransformerDecoderLayer(nn.Layer):
    """Decoder layer: multi-head self-attention, deformable cross-attention
    and a feed-forward network, each with a residual connection and a post
    LayerNorm."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()

        # --- self attention ---
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- cross attention ---
        self.cross_attn = MSDeformableAttention(
            embed_dim=d_model,
            num_heads=n_head,
            num_levels=n_levels,
            num_points=n_points,
            lr_mult=lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward network ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize both feed-forward linears."""
        ffn_linears = (self.linear1, self.linear2)
        for linear in ffn_linears:
            linear_init_(linear)
        # The weights are then overwritten with Xavier-uniform values.
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Add `pos` to `tensor` unless `pos` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward sub-block with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(tgt))
        hidden = self.linear2(self.dropout3(hidden))
        return self.norm3(tgt + self.dropout4(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        # Self attention: query and key share the position-augmented target.
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        self_attended = self.self_attn(query, key, value=tgt)
        tgt = self.norm1(tgt + self.dropout1(self_attended))

        # Cross attention against the encoder memory.
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_attended = self.cross_attn(
            cross_query, reference_points, memory, memory_spatial_shapes,
            memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_attended))

        # Feed-forward network.
        return self.forward_ffn(tgt)
|
||||
|
||||
|
||||
class DeformableTransformerDecoder(nn.Layer):
    """Stack of `num_layers` clones of `decoder_layer`.

    With `return_intermediate` the outputs of all layers are stacked along a
    new leading axis; otherwise only the last output is returned, with a
    leading axis of size 1.
    """

    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        hidden = tgt
        per_layer_outputs = []
        for layer in self.layers:
            hidden = layer(hidden, reference_points, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           memory_mask, query_pos_embed)
            if self.return_intermediate:
                per_layer_outputs.append(hidden)

        if not self.return_intermediate:
            return hidden.unsqueeze(0)

        return paddle.stack(per_layer_outputs)
|
||||
|
||||
|
||||
@register
class DeformableTransformer(nn.Layer):
    """Deformable-DETR style transformer.

    Projects the input feature maps to `hidden_dim`, adds extra levels by
    stride-2 convolutions when needed, runs the deformable encoder over the
    flattened multi-level features and decodes `num_queries` learned
    queries against the encoder memory.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=300,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 in_feats_channel=[512, 1024, 2048],
                 num_feature_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 lr_mult=0.1,
                 pe_temperature=10000,
                 pe_offset=-0.5):
        """
        Args:
            num_queries (int): Number of learned object queries.
            position_embed_type (str): 'sine' or 'learned'.
            return_intermediate_dec (bool): Return the output of every
                decoder layer instead of only the last one.
            in_feats_channel (list[int]): Channels of the input feature maps
                (not mutated here).
            num_feature_levels (int): Total number of feature levels; must
                be >= len(in_feats_channel).
            num_encoder_points (int): Sampling points in the encoder.
            num_decoder_points (int): Sampling points in the decoder.
            hidden_dim (int): Transformer feature dimension.
            nhead (int): Number of attention heads.
            num_encoder_layers (int): Number of encoder layers.
            num_decoder_layers (int): Number of decoder layers.
            dim_feedforward (int): Hidden size of the feed-forward networks.
            dropout (float): Dropout probability.
            activation (str): Name of the activation in
                `paddle.nn.functional`.
            lr_mult (float): Learning-rate multiplier for the sampling
                offsets of both encoder and decoder layers and for the
                reference-point projection.
            pe_temperature (int): Temperature of the position embedding.
            pe_offset (float): Offset of the position embedding.
        """
        super(DeformableTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_encoder_points, lr_mult)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)

        # `lr_mult` is forwarded here as well so that encoder and decoder
        # honour the same configured multiplier (previously the decoder
        # silently fell back to its own default of 0.1).
        decoder_layer = DeformableTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_decoder_points, lr_mult)
        self.decoder = DeformableTransformerDecoder(
            decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        # 1x1 projections for the given feature maps ...
        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim)))
        # ... and stride-2 3x3 convolutions for the extra levels.
        in_channels = in_feats_channel[-1]
        for _ in range(num_feature_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels,
                        hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1),
                    nn.GroupNorm(32, hidden_dim)))
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=position_embed_type == 'sine',
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize embeddings, reference-point head and input convs."""
        normal_(self.level_embed.weight)
        normal_(self.tgt_embed.weight)
        normal_(self.query_pos_embed.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for proj in self.input_proj:
            xavier_uniform_(proj[0].weight)
            constant_(proj[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive `in_feats_channel` from the channels of `input_shape`."""
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        """
        Args:
            src_feats (list[Tensor]): Input feature maps, one per level.
            src_mask (Tensor|None): Optional mask resized to every level;
                presumably non-zero marks valid pixels - TODO confirm.

        Returns:
            tuple: (decoder outputs, encoder memory, reference points).
        """
        srcs = [
            self.input_proj[i](feat) for i, feat in enumerate(src_feats)
        ]
        if self.num_feature_levels > len(srcs):
            len_srcs = len(srcs)
            for i in range(len_srcs, self.num_feature_levels):
                if i == len_srcs:
                    # First extra level is computed from the last raw input.
                    srcs.append(self.input_proj[i](src_feats[-1]))
                else:
                    srcs.append(self.input_proj[i](srcs[-1]))
        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            # Shapes are kept as 1-element tensors (slices of paddle.shape).
            src_shape = paddle.shape(src)
            bs = src_shape[0:1]
            h = src_shape[2:3]
            w = src_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))
            src = src.flatten(2).transpose([0, 2, 1])
            src_flatten.append(src)
            if src_mask is not None:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            lvl_pos_embed = pos_embed + self.level_embed.weight[level]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            mask = mask.flatten(1)
            mask_flatten.append(mask)
        src_flatten = paddle.concat(src_flatten, 1)
        mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,
                                                                   1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], start index of each level in the flattened sequence
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, l, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)

        # prepare input for decoder
        bs = memory.shape[0]
        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        reference_points = F.sigmoid(self.reference_points(query_embed))
        reference_points_input = reference_points.unsqueeze(
            2) * valid_ratios.unsqueeze(1)

        # decoder
        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
                          level_start_index, mask_flatten, query_embed)

        return (hs, memory, reference_points)
|
||||
359
rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
Normal file
359
rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
Normal file
@@ -0,0 +1,359 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention, _convert_attention_mask
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .utils import _get_clones
|
||||
from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
|
||||
|
||||
__all__ = ['DETRTransformer']
|
||||
|
||||
|
||||
class TransformerEncoderLayer(nn.Layer):
    """Transformer encoder layer: multi-head self-attention plus a
    feed-forward network, with LayerNorm applied before (`normalize_before`)
    or after each residual connection."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        # The dedicated dropout rates fall back to the generic one.
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize both feed-forward linears."""
        for linear in (self.linear1, self.linear2):
            linear_init_(linear)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Add `pos_embed` to `tensor` unless `pos_embed` is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        pre_norm = self.normalize_before

        # Self-attention sub-block.
        attn_input = self.norm1(src) if pre_norm else src
        query = key = self.with_pos_embed(attn_input, pos_embed)
        attn_output = self.self_attn(
            query, key, value=attn_input, attn_mask=src_mask)
        src = src + self.dropout1(attn_output)
        if not pre_norm:
            src = self.norm1(src)

        # Feed-forward sub-block.
        ffn_input = self.norm2(src) if pre_norm else src
        ffn_hidden = self.activation(self.linear1(ffn_input))
        ffn_output = self.linear2(self.dropout(ffn_hidden))
        src = src + self.dropout2(ffn_output)
        if not pre_norm:
            src = self.norm2(src)
        return src
|
||||
|
||||
|
||||
class TransformerEncoder(nn.Layer):
    """Sequential stack of encoder layers with an optional final norm.

    Args:
        encoder_layer: layer handed to ``_get_clones`` to build the stack.
        num_layers: number of layers requested from ``_get_clones``.
        norm: optional layer applied to the output of the last layer.
    """

    def __init__(self, encoder_layer, num_layers, norm=None):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        """Feed ``src`` through every layer, then through ``norm`` if set."""
        hidden = src
        for block in self.layers:
            hidden = block(hidden, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is None:
            return hidden
        return self.norm(hidden)
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Layer):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Every sub-layer sits on a residual branch. LayerNorm runs before each
    sub-layer when ``normalize_before`` is True and after the residual sum
    otherwise.

    Args:
        d_model: feature width of the layer input and output.
        nhead: number of heads passed to both ``MultiHeadAttention`` layers.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout rate on the residual branches; also the fallback
            for ``attn_dropout`` and ``act_dropout`` when they are None.
        activation: name of a function looked up on ``paddle.nn.functional``.
        attn_dropout: dropout rate handed to the attention layers.
        act_dropout: dropout rate applied after the activation.
        normalize_before: True for pre-norm, False for post-norm.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise both feed-forward linears with ``linear_init_``."""
        for fc in (self.linear1, self.linear2):
            linear_init_(fc)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` when no embedding is given."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Update the queries ``tgt`` by attending to themselves and to ``memory``.

        ``query_pos_embed`` is added to the queries (and to the self-attention
        keys); ``pos_embed`` is added to the cross-attention keys built from
        ``memory``. Values never receive a positional embedding.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        pre_norm = self.normalize_before

        # --- self-attention over the queries ---
        sa_in = self.norm1(tgt) if pre_norm else tgt
        query = key = self.with_pos_embed(sa_in, query_pos_embed)
        sa_out = self.self_attn(query, key, value=sa_in, attn_mask=tgt_mask)
        tgt = tgt + self.dropout1(sa_out)
        if not pre_norm:
            tgt = self.norm1(tgt)

        # --- cross-attention from the queries to the encoder memory ---
        ca_in = self.norm2(tgt) if pre_norm else tgt
        query = self.with_pos_embed(ca_in, query_pos_embed)
        key = self.with_pos_embed(memory, pos_embed)
        ca_out = self.cross_attn(
            query, key, value=memory, attn_mask=memory_mask)
        tgt = tgt + self.dropout2(ca_out)
        if not pre_norm:
            tgt = self.norm2(tgt)

        # --- feed-forward sub-layer ---
        ffn_in = self.norm3(tgt) if pre_norm else tgt
        hidden = self.activation(self.linear1(ffn_in))
        ffn_out = self.linear2(self.dropout(hidden))
        tgt = tgt + self.dropout3(ffn_out)
        if not pre_norm:
            tgt = self.norm3(tgt)
        return tgt
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Layer):
    """Sequential stack of decoder layers with an optional output norm.

    Args:
        decoder_layer: layer handed to ``_get_clones`` to build the stack.
        num_layers: number of layers requested from ``_get_clones``.
        norm: optional layer applied to decoder outputs. May be None.
        return_intermediate: when True, ``forward`` returns the (normalised)
            output of every layer stacked on a new leading axis instead of
            only the last one.
    """

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def _apply_norm(self, tensor):
        """Apply ``self.norm`` when one was configured, else pass through."""
        return tensor if self.norm is None else self.norm(tensor)

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Run ``tgt`` through every decoder layer.

        Returns:
            With ``return_intermediate``: ``paddle.stack`` of each layer's
            output (normalised when ``norm`` is set). Otherwise the last
            layer's output (normalised when ``norm`` is set) with a leading
            axis of size 1 added, so both modes have the same rank.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

        output = tgt
        intermediate = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # `norm` is optional; calling it unguarded raised a
                # TypeError when norm=None and return_intermediate=True.
                intermediate.append(self._apply_norm(output))

        if self.return_intermediate:
            return paddle.stack(intermediate)

        return self._apply_norm(output).unsqueeze(0)
|
||||
|
||||
|
||||
@register
class DETRTransformer(nn.Layer):
    """DETR encoder-decoder transformer operating on the last backbone level.

    The last feature map is projected to ``hidden_dim`` channels with a 1x1
    convolution, flattened, encoded, and then decoded with ``num_queries``
    learned query embeddings.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_temperature=10000,
                 pe_offset=0.,
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        # Positional arguments shared by the encoder and decoder layer.
        layer_args = (hidden_dim, nhead, dim_feedforward, dropout, activation,
                      attn_dropout, act_dropout, normalize_before)

        encoder_layer = TransformerEncoderLayer(*layer_args)
        # A final encoder norm is only created in the pre-norm configuration.
        if normalize_before:
            encoder_norm = nn.LayerNorm(hidden_dim)
        else:
            encoder_norm = None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(*layer_args)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-init every parameter with more than one dim, then re-init
        the input projection and the query embedding with their own schemes."""
        for param in self.parameters():
            if param.dim() > 1:
                xavier_uniform_(param)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Take ``backbone_num_channels`` from the last entry of ``input_shape``."""
        channels = [shape.channels for shape in input_shape]
        return {'backbone_num_channels': channels[-1], }

    def _convert_attention_mask(self, mask):
        """Turn a 1/0 mask into an additive one: 1 -> 0, 0 -> -1e9."""
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies the transformer to the last backbone feature map.

        Parameters:
            src (List(Tensor)): Backbone feature maps; only ``src[-1]``,
                with shape [bs, c, h, w], is used.
            src_mask (Tensor, optional): Mask with shape [bs, H, W] that is
                resized to the feature map size. It is converted with
                ``(mask - 1.0) * 1e9``, i.e. entries equal to 1 are attended
                to and entries equal to 0 are suppressed. When None, an
                all-ones mask is used. Default None.

        Returns:
            tuple: ``(output, memory, src_proj, src_mask)`` where
                output is the decoder result (leading axis added by the
                    decoder: one entry per decoder layer, or a single entry),
                memory is the encoder output reshaped to [bs, c, h, w],
                src_proj is the projected feature map [bs, c, h, w],
                src_mask is the additive mask reshaped to [bs, 1, 1, h, w]
                    in training mode and None otherwise.
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = paddle.shape(src_proj)
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])

        if src_mask is None:
            src_mask = paddle.ones([bs, h, w])
        else:
            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
        pos_embed = self.position_embedding(src_mask).flatten(1, 2)

        # The attention mask is only used while training.
        if self.training:
            src_mask = self._convert_attention_mask(src_mask)
            src_mask = src_mask.reshape([bs, 1, 1, h * w])
        else:
            src_mask = None

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        # One copy of the learned query embeddings per batch item; the
        # decoder input itself starts from zeros.
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        # In eval mode src_mask is already None at this point.
        if self.training:
            src_mask = src_mask.reshape([bs, 1, 1, h, w])

        memory_map = memory.transpose([0, 2, 1]).reshape([bs, c, h, w])
        return (output, memory_map, src_proj, src_mask)
|
||||
527
rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
Normal file
527
rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
Normal file
@@ -0,0 +1,527 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
|
||||
# Copyright (c) 2020 SenseTime. All Rights Reserved.
|
||||
# Modified from detrex (https://github.com/IDEA-Research/detrex)
|
||||
# Copyright 2022 The IDEA Authors. All rights reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .deformable_transformer import (MSDeformableAttention,
|
||||
DeformableTransformerEncoderLayer,
|
||||
DeformableTransformerEncoder)
|
||||
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
|
||||
bias_init_with_prob)
|
||||
from .utils import (_get_clones, get_valid_ratio,
|
||||
get_contrastive_denoising_training_group,
|
||||
get_sine_pos_embed, inverse_sigmoid, MLP)
|
||||
|
||||
__all__ = ['DINOTransformer']
|
||||
|
||||
|
||||
class DINOTransformerDecoderLayer(nn.Layer):
    """One DINO decoder layer.

    Post-norm sequence of: multi-head self-attention over the queries,
    ``MSDeformableAttention`` cross-attention into the encoder memory, and a
    two-layer feed-forward network. Each step is followed by dropout, a
    residual sum and LayerNorm.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=1.0,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()

        def make_norm():
            # All three norms share the same attribute configuration.
            return nn.LayerNorm(
                d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = make_norm()

        # cross attention
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = make_norm()

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = make_norm()
        self._reset_parameters()

    def _reset_parameters(self):
        """Apply ``linear_init_`` to both FFN linears, then Xavier-init their weights."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """linear1 -> activation -> dropout3 -> linear2."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Refine the queries ``tgt`` against themselves and ``memory``.

        ``attn_mask``, when given, is cast to bool and converted to an
        additive mask: True -> 0, False -> -inf.
        """
        # self attention
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            mask_shape = attn_mask.shape
            allowed = attn_mask.astype('bool')
            attn_mask = paddle.where(
                allowed,
                paddle.zeros(mask_shape, tgt.dtype),
                paddle.full(mask_shape, float("-inf"), tgt.dtype))
        self_out = self.self_attn(query, key, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
|
||||
|
||||
|
||||
class DINOTransformerDecoder(nn.Layer):
    """Stack of DINO decoder layers with layer-by-layer box refinement.

    After every layer the reference boxes are updated from that layer's
    output via ``bbox_head[i]``. The layer outputs (after a shared LayerNorm)
    and the refined boxes of all layers are returned.
    """

    def __init__(self,
                 hidden_dim,
                 decoder_layer,
                 num_layers,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(
            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                query_pos_head,
                valid_ratios=None,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` and iteratively refine the reference boxes.

        Args:
            tgt: initial query features.
            ref_points_unact: initial reference boxes before sigmoid.
            bbox_head: indexable collection; ``bbox_head[i]`` predicts the
                box delta for layer ``i``.
            query_pos_head: maps the sine embedding of the reference boxes
                to the query positional embedding.
            valid_ratios: defaults to all ones with shape
                [memory.shape[0], memory_spatial_shapes.shape[0], 2].

        Returns:
            (stacked normalised layer outputs, stacked refined boxes).
        """
        if valid_ratios is None:
            valid_ratios = paddle.ones(
                [memory.shape[0], memory_spatial_shapes.shape[0], 2])

        hidden = tgt
        layer_feats = []
        layer_boxes = []
        ref_points = F.sigmoid(ref_points_unact)
        for idx, layer in enumerate(self.layers):
            # Boxes are detached so gradients do not flow across layers
            # through the reference points.
            detached_ref = ref_points.detach()
            ratios = valid_ratios.tile([1, 1, 2]).unsqueeze(1)
            reference_points_input = detached_ref.unsqueeze(2) * ratios

            sine_embed = get_sine_pos_embed(
                reference_points_input[..., 0, :], self.hidden_dim // 2)
            query_pos_embed = query_pos_head(sine_embed)

            hidden = layer(hidden, reference_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Refine in logit space, then squash back to [0, 1].
            box_delta = bbox_head[idx](hidden)
            ref_points = F.sigmoid(box_delta + inverse_sigmoid(
                ref_points.detach()))

            layer_feats.append(self.norm(hidden))
            layer_boxes.append(ref_points)

        return paddle.stack(layer_feats), paddle.stack(layer_boxes)
|
||||
|
||||
|
||||
@register
class DINOTransformer(nn.Layer):
    """DINO transformer: multi-level deformable encoder, query selection
    from the encoder output, and a box-refining decoder, with contrastive
    denoising queries added in training mode.
    """
    __shared__ = ['num_classes', 'hidden_dim']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=900,
                 position_embed_type='sine',
                 in_feats_channel=[512, 1024, 2048],
                 num_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 lr_mult=1.0,
                 pe_temperature=10000,
                 pe_offset=-0.5,
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eps=1e-2):
        """Build projections, encoder, decoder, embeddings and heads.

        ``in_feats_channel`` lists the channel count of each backbone level;
        when it has fewer entries than ``num_levels``, extra levels are
        produced by stride-2 convolutions (see ``_build_input_proj_layer``).
        ``eps`` is the margin used to decide which anchors are valid in
        ``_get_encoder_output_anchors``.
        """
        super(DINOTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers

        # Norm layers are created with L2Decay(0.0) on weight and bias.
        weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        bias_attr = ParamAttr(regularizer=L2Decay(0.0))
        # backbone feature projection
        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)

        # Transformer module
        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_encoder_points, lr_mult, weight_attr, bias_attr)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)
        decoder_layer = DINOTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points, lr_mult, weight_attr, bias_attr)
        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,
                                              num_decoder_layers, weight_attr,
                                              bias_attr)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # position embedding
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=True if position_embed_type == 'sine' else False,
            embed_type=position_embed_type,
            offset=pe_offset)
        self.level_embed = nn.Embedding(num_levels, hidden_dim)
        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        # Used by the decoder in every configuration, so it is created
        # regardless of `learnt_init_query`.
        self.query_pos_head = MLP(2 * hidden_dim,
                                  hidden_dim,
                                  hidden_dim,
                                  num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
        # decoder head: one score head and one box head per decoder layer
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise heads, embeddings and input projections."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        # The last layer of every box head goes through `constant_` with its
        # default value.
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        normal_(self.level_embed.weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        # l[0] is the conv of each projection Sequential.
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)
            constant_(l[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Take ``in_feats_channel`` from the ``channels`` of every input shape."""
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def _build_input_proj_layer(self,
                                in_feats_channel,
                                weight_attr=None,
                                bias_attr=None):
        """Create ``self.input_proj`` with one conv + GroupNorm per level.

        Levels that have a backbone feature get a 1x1 conv. Each remaining
        level (up to ``self.num_levels``) gets a 3x3 stride-2 conv; the first
        of those takes the last backbone channel count as input, later ones
        take ``hidden_dim``.
        """
        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels, self.hidden_dim, kernel_size=1)), (
                            'norm', nn.GroupNorm(
                                32,
                                self.hidden_dim,
                                weight_attr=weight_attr,
                                bias_attr=bias_attr))))
        in_channels = in_feats_channel[-1]
        for _ in range(self.num_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1)), ('norm', nn.GroupNorm(
                            32,
                            self.hidden_dim,
                            weight_attr=weight_attr,
                            bias_attr=bias_attr))))
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats, pad_mask=None):
        """Project, flatten and concatenate all levels for the encoder.

        Returns:
            tuple: (feat_flatten, spatial_shapes, level_start_index,
            mask_flatten, lvl_pos_embed_flatten, valid_ratios).
            ``mask_flatten`` is None when ``pad_mask`` is None.
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                # The first extra level is computed from the raw last
                # backbone feature, later ones from the previous projection.
                if i == len_srcs:
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for i, feat in enumerate(proj_feats):
            bs, _, h, w = paddle.shape(feat)
            spatial_shapes.append(paddle.stack([h, w]))
            # [b,c,h,w] -> [b,h*w,c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            if pad_mask is not None:
                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
            else:
                # No padding information: use an all-ones mask.
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            # [b, h*w, c]
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            # Add the per-level embedding to the positional embedding.
            lvl_pos_embed = pos_embed + self.level_embed.weight[i]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            if pad_mask is not None:
                # [b, h*w]
                mask_flatten.append(mask.flatten(1))

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # [b, l]
        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,
                                                                   1)
        # [b, l, c]
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [num_levels, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l] start index of each level
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, num_levels, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)
        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
                lvl_pos_embed_flatten, valid_ratios)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Run encoder, query selection and decoder.

        Args:
            feats: backbone feature maps, one per level.
            pad_mask: optional padding mask forwarded to
                ``_get_encoder_input``.
            gt_meta: passed to ``get_contrastive_denoising_training_group``;
                only read in training mode.

        Returns:
            tuple: (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
            dn_meta). ``out_bboxes`` and ``out_logits`` are stacked over the
            decoder layers; ``dn_meta`` is None outside training mode.
        """
        # input projection and embedding
        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
         lvl_pos_embed_flatten,
         valid_ratios) = self._get_encoder_input(feats, pad_mask)

        # encoder
        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, mask_flatten, denoising_class,
            denoising_bbox_unact)

        # decoder
        inter_feats, inter_bboxes = self.decoder(
            target, init_ref_points_unact, memory, spatial_shapes,
            level_start_index, self.dec_bbox_head, self.query_pos_head,
            valid_ratios, attn_mask, mask_flatten)
        out_bboxes = []
        out_logits = []
        for i in range(self.num_decoder_layers):
            out_logits.append(self.dec_score_head[i](inter_feats[i]))
            # Layer 0 refines the initial reference points; layer i > 0
            # refines the boxes produced by layer i - 1.
            if i == 0:
                out_bboxes.append(
                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
                              init_ref_points_unact))
            else:
                out_bboxes.append(
                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
                              inverse_sigmoid(inter_bboxes[i - 1])))
        out_bboxes = paddle.stack(out_bboxes)
        out_logits = paddle.stack(out_logits)

        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _get_encoder_output_anchors(self,
                                    memory,
                                    spatial_shapes,
                                    memory_mask=None,
                                    grid_size=0.05):
        """Build per-position anchors and the masked encoder output.

        For each level an anchor is created per grid cell: its centre is
        ``(cell + 0.5)`` divided by the valid width/height, and its width and
        height are ``grid_size * 2**lvl``. Anchors are returned in logit
        space, with invalid ones set to ``inf``.

        Returns:
            (output_memory, output_anchors): ``memory`` zeroed at invalid
            positions and passed through ``self.enc_output``, and the
            anchors described above.
        """
        output_anchors = []
        idx = 0
        for lvl, (h, w) in enumerate(spatial_shapes):
            if memory_mask is not None:
                # Slice this level out of the flattened mask and count the
                # entries along the first column / first row.
                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])
                valid_H = paddle.sum(mask_[:, :, 0], 1)
                valid_W = paddle.sum(mask_[:, 0, :], 1)
            else:
                valid_H, valid_W = h, w

            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=h), paddle.arange(end=w))
            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)

            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
                [-1, 1, 1, 2]).astype(grid_xy.dtype)
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            output_anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
            idx += h * w

        output_anchors = paddle.concat(output_anchors, 1)
        # An anchor is valid only if all four values lie in (eps, 1 - eps).
        valid_mask = ((output_anchors > self.eps) *
                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)
        # Inverse sigmoid (logit) of the anchors.
        output_anchors = paddle.log(output_anchors / (1 - output_anchors))
        if memory_mask is not None:
            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0
        output_anchors = paddle.where(valid_mask, output_anchors,
                                      paddle.to_tensor(float("inf")))

        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)
        return output_memory, output_anchors

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           memory_mask=None,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top ``num_queries`` encoder positions as decoder input.

        Positions are ranked by their maximum class logit. Denoising queries
        and boxes, when given, are concatenated in front of the selected
        ones.

        Returns:
            (target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits); the reference points are detached and the
            encoder boxes/logits do not include the denoising part.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        output_memory, output_anchors = self._get_encoder_output_anchors(
            memory, spatial_shapes, memory_mask)
        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(
            output_memory) + output_anchors

        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        # Pair every selected index with its batch index for gather_nd.
        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind).detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact.detach(
        ), enc_topk_bboxes, enc_topk_logits
|
||||
85
rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
Normal file
85
rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Multi-scale deformable attention自定义OP编译
|
||||
该自定义OP参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)文档实现。
|
||||
|
||||
## 1. 环境依赖
|
||||
- Paddle >= 2.3.2
|
||||
- gcc 8.2
|
||||
|
||||
## 2. 安装
|
||||
请在当前路径下进行编译安装
|
||||
```
|
||||
cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
|
||||
python setup_ms_deformable_attn_op.py install
|
||||
```
|
||||
|
||||
编译完成后即可使用,以下为`ms_deformable_attn`的使用示例
|
||||
```
|
||||
# 引入自定义op
|
||||
from deformable_detr_ops import ms_deformable_attn
|
||||
|
||||
# 构造fake input tensor
|
||||
bs, n_heads, c = 2, 8, 8
|
||||
query_length, n_levels, n_points = 2, 2, 2
|
||||
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
|
||||
level_start_index = paddle.concat((paddle.to_tensor(
|
||||
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
|
||||
value_length = sum([(H * W).item() for H, W in spatial_shapes])
|
||||
|
||||
def get_test_tensors(channels):
|
||||
value = paddle.rand(
|
||||
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
|
||||
sampling_locations = paddle.rand(
|
||||
[bs, query_length, n_heads, n_levels, n_points, 2],
|
||||
dtype=paddle.float32)
|
||||
attention_weights = paddle.rand(
|
||||
[bs, query_length, n_heads, n_levels, n_points],
|
||||
dtype=paddle.float32) + 1e-5
|
||||
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
|
||||
-2, keepdim=True)
|
||||
return [value, sampling_locations, attention_weights]
|
||||
|
||||
value, sampling_locations, attention_weights = get_test_tensors(c)
|
||||
|
||||
output = ms_deformable_attn(value,
|
||||
spatial_shapes,
|
||||
level_start_index,
|
||||
sampling_locations,
|
||||
attention_weights)
|
||||
```
|
||||
|
||||
## 3. 单元测试
|
||||
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示:
|
||||
```
|
||||
python test_ms_deformable_attn_op.py
|
||||
```
|
||||
运行成功后,打印如下:
|
||||
```
|
||||
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
|
||||
*tensor1 True check_gradient_numerical(D=30)
|
||||
*tensor2 True check_gradient_numerical(D=30)
|
||||
*tensor3 True check_gradient_numerical(D=30)
|
||||
*tensor1 True check_gradient_numerical(D=32)
|
||||
*tensor2 True check_gradient_numerical(D=32)
|
||||
*tensor3 True check_gradient_numerical(D=32)
|
||||
*tensor1 True check_gradient_numerical(D=64)
|
||||
*tensor2 True check_gradient_numerical(D=64)
|
||||
*tensor3 True check_gradient_numerical(D=64)
|
||||
*tensor1 True check_gradient_numerical(D=71)
|
||||
*tensor2 True check_gradient_numerical(D=71)
|
||||
*tensor3 True check_gradient_numerical(D=71)
|
||||
*tensor1 True check_gradient_numerical(D=128)
|
||||
*tensor2 True check_gradient_numerical(D=128)
|
||||
*tensor3 True check_gradient_numerical(D=128)
|
||||
*tensor1 True check_gradient_numerical(D=1024)
|
||||
*tensor2 True check_gradient_numerical(D=1024)
|
||||
*tensor3 True check_gradient_numerical(D=1024)
|
||||
*tensor1 True check_gradient_numerical(D=1025)
|
||||
*tensor2 True check_gradient_numerical(D=1025)
|
||||
*tensor3 True check_gradient_numerical(D=1025)
|
||||
*tensor1 True check_gradient_numerical(D=2048)
|
||||
*tensor2 True check_gradient_numerical(D=2048)
|
||||
*tensor3 True check_gradient_numerical(D=2048)
|
||||
*tensor1 True check_gradient_numerical(D=3096)
|
||||
*tensor2 True check_gradient_numerical(D=3096)
|
||||
*tensor3 True check_gradient_numerical(D=3096)
|
||||
```
|
||||
@@ -0,0 +1,65 @@
|
||||
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// declare GPU implementation
|
||||
std::vector<paddle::Tensor>
|
||||
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
|
||||
const paddle::Tensor &value_spatial_shapes,
|
||||
const paddle::Tensor &value_level_start_index,
|
||||
const paddle::Tensor &sampling_locations,
|
||||
const paddle::Tensor &attention_weights);
|
||||
|
||||
std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
|
||||
const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
|
||||
const paddle::Tensor &value_level_start_index,
|
||||
const paddle::Tensor &sampling_locations,
|
||||
const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
|
||||
|
||||
//// CPU not implemented
|
||||
|
||||
// Shape inference for ms_deformable_attn.
//
// Produces a single output shape:
//   [value_shape[0], sampling_locations_shape[1],
//    value_shape[2] * value_shape[3]]
// The remaining input shapes are accepted to match the op's input list but do
// not influence the result.
std::vector<std::vector<int64_t>>
MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
                           std::vector<int64_t> value_spatial_shapes_shape,
                           std::vector<int64_t> value_level_start_index_shape,
                           std::vector<int64_t> sampling_locations_shape,
                           std::vector<int64_t> attention_weights_shape) {
  const int64_t batch_size = value_shape[0];
  const int64_t query_length = sampling_locations_shape[1];
  // The last two axes of `value` are merged into one output axis.
  const int64_t merged_channels = value_shape[2] * value_shape[3];

  std::vector<int64_t> out_shape;
  out_shape.push_back(batch_size);
  out_shape.push_back(query_length);
  out_shape.push_back(merged_channels);

  std::vector<std::vector<int64_t>> shapes;
  shapes.push_back(out_shape);
  return shapes;
}
|
||||
|
||||
std::vector<paddle::DataType>
|
||||
MSDeformableAttnInferDtype(paddle::DataType value_dtype,
|
||||
paddle::DataType value_spatial_shapes_dtype,
|
||||
paddle::DataType value_level_start_index_dtype,
|
||||
paddle::DataType sampling_locations_dtype,
|
||||
paddle::DataType attention_weights_dtype) {
|
||||
return {value_dtype};
|
||||
}
|
||||
|
||||
// Forward op registration: five inputs, one output, CUDA forward kernel, and
// the shape/dtype inference functions defined above.
PD_BUILD_OP(ms_deformable_attn)
    .Inputs({"Value",
             "SpatialShapes",
             "LevelIndex",
             "SamplingLocations",
             "AttentionWeights"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));

// Gradient op registration: consumes the forward inputs plus the gradient of
// "Out" and declares a gradient output for each of the five forward inputs.
PD_BUILD_GRAD_OP(ms_deformable_attn)
    .Inputs({"Value",
             "SpatialShapes",
             "LevelIndex",
             "SamplingLocations",
             "AttentionWeights",
             paddle::Grad("Out")})
    .Outputs({paddle::Grad("Value"),
              paddle::Grad("SpatialShapes"),
              paddle::Grad("LevelIndex"),
              paddle::Grad("SamplingLocations"),
              paddle::Grad("AttentionWeights")})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,7 @@
|
||||
from paddle.utils.cpp_extension import CUDAExtension, setup
|
||||
|
||||
if __name__ == "__main__":
    # One CUDA extension built from the C++ op registration file and the
    # CUDA source; it is installed under the name `deformable_detr_ops`.
    extension = CUDAExtension(
        sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu'])
    setup(name='deformable_detr_ops', ext_modules=extension)
|
||||
@@ -0,0 +1,140 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
from __future__ import division
|
||||
|
||||
import os
|
||||
import sys
|
||||
import random
|
||||
import numpy as np
|
||||
import paddle
|
||||
# Make the PaddleDetection package importable: resolve this file's path
# followed by five parent-directory components and append it to sys.path once.
parent_path = os.path.abspath(os.path.join(__file__, *([os.pardir] * 5)))
if parent_path not in sys.path:
    sys.path.append(parent_path)
|
||||
|
||||
from ppdet.modeling.transformers.utils import deformable_attention_core_func
|
||||
ms_deform_attn_core_paddle = deformable_attention_core_func
|
||||
|
||||
# The optional first command-line argument selects the GPU to test on.
# Fall back to GPU 0 when the argument is missing (IndexError) or is not an
# integer (ValueError). Only these two errors are caught so that
# KeyboardInterrupt / SystemExit are never swallowed.
try:
    gpu_index = int(sys.argv[1])
except (IndexError, ValueError):
    gpu_index = 0
print(f'Use gpu {gpu_index} to test...')
|
||||
paddle.set_device(f'gpu:{gpu_index}')
|
||||
|
||||
try:
|
||||
from deformable_detr_ops import ms_deformable_attn
|
||||
except Exception as e:
|
||||
print('import deformable_detr_ops error', e)
|
||||
sys.exit(-1)
|
||||
|
||||
# Seed every random number generator the test touches so the generated
# tensors are reproducible from run to run.
paddle.seed(1)
random.seed(1)
np.random.seed(1)

# Problem sizes shared by all checks below.
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2

# (H, W) of each feature level.
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)

# Start offset of each level in the flattened value sequence:
# a leading 0 followed by the running sum of H * W without its last entry.
level_start_index = paddle.concat((
    paddle.to_tensor([0], dtype=paddle.int64),
    spatial_shapes.prod(1).cumsum(0)[:-1],
))

# Total number of value positions over all levels.
value_length = sum((H * W).item() for H, W in spatial_shapes)
|
||||
|
||||
|
||||
def get_test_tensors(channels):
    """Create random inputs for the deformable-attention op.

    Args:
        channels: size of the last axis of ``value``.

    Returns:
        list: ``[value, sampling_locations, attention_weights]`` with shapes
        ``[bs, value_length, n_heads, channels]``,
        ``[bs, query_length, n_heads, n_levels, n_points, 2]`` and
        ``[bs, query_length, n_heads, n_levels, n_points]``. The attention
        weights are divided by their sum over the last two axes.
    """
    value_shape = [bs, value_length, n_heads, channels]
    location_shape = [bs, query_length, n_heads, n_levels, n_points, 2]
    weight_shape = [bs, query_length, n_heads, n_levels, n_points]

    # Draw order (value, locations, weights) matters for reproducibility.
    value = paddle.rand(value_shape, dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(location_shape, dtype=paddle.float32)
    # The small offset keeps the normalizing sum strictly positive.
    attention_weights = paddle.rand(weight_shape, dtype=paddle.float32) + 1e-5

    normalizer = attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
    attention_weights /= normalizer

    return [value, sampling_locations, attention_weights]
|
||||
|
||||
|
||||
@paddle.no_grad()
def check_forward_equal_with_paddle_float():
    """Compare the custom op's forward output with the Paddle reference.

    Both implementations receive the same randomly generated inputs. The
    function prints whether the outputs are close (rtol=1e-2, atol=1e-3)
    together with the maximum absolute and relative errors; it returns
    nothing.
    """
    value, sampling_locations, attention_weights = get_test_tensors(c)
    op_args = (value, spatial_shapes, level_start_index, sampling_locations,
               attention_weights)

    output_paddle = ms_deform_attn_core_paddle(*op_args).detach().cpu()
    output_cuda = ms_deformable_attn(*op_args).detach().cpu()

    fwdok = paddle.allclose(
        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
    max_abs_err = (output_cuda - output_paddle).abs().max().item()
    # Relative error is measured against the reference output's magnitude.
    relative_err = (output_cuda - output_paddle).abs() / output_paddle.abs()
    max_rel_err = relative_err.max().item()

    print(
        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
    )
|
||||
|
||||
|
||||
def check_gradient_numerical(channels=4):
    """Compare input gradients of the custom op with the Paddle reference.

    The same random inputs are fed to both implementations, the sum of each
    output is back-propagated, and for each of the three inputs (value,
    sampling locations, attention weights) one line is printed stating
    whether the gradients are close (rtol=1e-2, atol=1e-3).

    Args:
        channels: size of the last axis of the generated ``value`` tensor.
    """
    reference_inputs = get_test_tensors(channels)
    for tensor in reference_inputs:
        tensor.stop_gradient = False
    (value_paddle, sampling_locations_paddle,
     attention_weights_paddle) = reference_inputs

    # Independent copies so each implementation accumulates its own grads.
    cuda_inputs = [tensor.detach().clone() for tensor in reference_inputs]
    for tensor in cuda_inputs:
        tensor.stop_gradient = False
    value_cuda, sampling_locations_cuda, attention_weights_cuda = cuda_inputs

    output_paddle = ms_deform_attn_core_paddle(
        value_paddle, spatial_shapes, level_start_index,
        sampling_locations_paddle, attention_weights_paddle)
    output_paddle.sum().backward()

    output_cuda = ms_deformable_attn(
        value_cuda, spatial_shapes, level_start_index,
        sampling_locations_cuda, attention_weights_cuda)
    output_cuda.sum().backward()

    def grads_match(reference, candidate):
        # True when the two tensors' gradients agree within tolerance.
        return paddle.allclose(
            reference.grad, candidate.grad, rtol=1e-2, atol=1e-3).item()

    res = grads_match(value_paddle, value_cuda)
    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')

    res = grads_match(sampling_locations_paddle, sampling_locations_cuda)
    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')

    res = grads_match(attention_weights_paddle, attention_weights_cuda)
    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Forward check first, then the gradient check for each channel size.
    check_forward_equal_with_paddle_float()

    for channels in (30, 32, 64, 71, 128, 1024, 1025, 2048, 3096):
        check_gradient_numerical(channels)
|
||||
287
rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
Normal file
287
rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ..shape_spec import ShapeSpec
|
||||
from ..backbones.csp_darknet import BaseConv
|
||||
from ..backbones.cspresnet import RepVggBlock
|
||||
from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
|
||||
from ..initializer import xavier_uniform_, linear_init_
|
||||
from ..layers import MultiHeadAttention
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
__all__ = ['HybridEncoder']
|
||||
|
||||
|
||||
class CSPRepLayer(nn.Layer):
    """Two-branch convolutional block.

    One branch applies a 1x1 conv followed by ``num_blocks`` RepVggBlocks, the
    other applies a single 1x1 conv. The branches are summed and passed
    through an output 1x1 conv, which is an identity when the hidden width
    already equals ``out_channels``.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        num_blocks: number of RepVggBlocks in the main branch.
        expansion: hidden width is ``int(out_channels * expansion)``.
        bias: passed as ``bias`` to every BaseConv.
        act: activation passed to every BaseConv and RepVggBlock.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=3,
                 expansion=1.0,
                 bias=False,
                 act="silu"):
        super().__init__()
        hidden_channels = int(out_channels * expansion)
        # Settings shared by all 1x1 convolutions in this block.
        pointwise = dict(ksize=1, stride=1, bias=bias, act=act)

        self.conv1 = BaseConv(in_channels, hidden_channels, **pointwise)
        self.conv2 = BaseConv(in_channels, hidden_channels, **pointwise)

        blocks = []
        for _ in range(num_blocks):
            blocks.append(
                RepVggBlock(
                    hidden_channels, hidden_channels, act=act))
        self.bottlenecks = nn.Sequential(*blocks)

        if hidden_channels == out_channels:
            self.conv3 = nn.Identity()
        else:
            self.conv3 = BaseConv(hidden_channels, out_channels, **pointwise)

    def forward(self, x):
        """Return ``conv3(bottlenecks(conv1(x)) + conv2(x))``."""
        main_branch = self.bottlenecks(self.conv1(x))
        side_branch = self.conv2(x)
        return self.conv3(main_branch + side_branch)
|
||||
|
||||
|
||||
@register
class TransformerLayer(nn.Layer):
    """Self-attention plus feed-forward layer with residual connections.

    LayerNorm is applied before each sub-block when ``normalize_before`` is
    true and after the residual addition otherwise.

    Args:
        d_model: feature width of the input and output.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout rate applied to each sub-block's output.
        activation: name of a function in ``paddle.nn.functional``.
        attn_dropout: attention dropout rate; defaults to ``dropout``.
        act_dropout: dropout rate after the activation; defaults to
            ``dropout``.
        normalize_before: selects pre-norm (True) or post-norm (False).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)

        # Feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize both feed-forward linear layers."""
        for layer in (self.linear1, self.linear2):
            linear_init_(layer)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` when it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply the attention and feed-forward sub-blocks to ``src``.

        Args:
            src: input tensor.
            src_mask: optional mask passed to attention as ``attn_mask``.
            pos_embed: optional embedding added to queries and keys only.

        Returns:
            Tensor produced by the two residual sub-blocks.
        """
        pre_norm = self.normalize_before

        # Self-attention sub-block; the positional embedding is added to the
        # queries and keys but not to the values.
        attn_input = self.norm1(src) if pre_norm else src
        q = k = self.with_pos_embed(attn_input, pos_embed)
        attn_output = self.self_attn(
            q, k, value=attn_input, attn_mask=src_mask)
        src = src + self.dropout1(attn_output)
        if not pre_norm:
            src = self.norm1(src)

        # Feed-forward sub-block.
        ffn_input = self.norm2(src) if pre_norm else src
        hidden = self.dropout(self.activation(self.linear1(ffn_input)))
        ffn_output = self.linear2(hidden)
        src = src + self.dropout2(ffn_output)
        if not pre_norm:
            src = self.norm2(src)
        return src
|
||||
|
||||
|
||||
@register
@serializable
class HybridEncoder(nn.Layer):
    """Multi-level feature encoder.

    Each input level is projected to ``hidden_dim`` channels, the levels
    listed in ``use_encoder_idx`` are passed through a transformer encoder,
    and the levels are then fused by a top-down FPN pass followed by a
    bottom-up PAN pass.

    Args:
        in_channels: channel count of each input feature level.
        feat_strides: stride of each input feature level.
        hidden_dim: channel count of every projected and output level.
        use_encoder_idx: indices of the levels given to the transformer.
        num_encoder_layers: layers per transformer encoder; 0 disables it.
        encoder_layer: layer handed to ``TransformerEncoder``.
        pe_temperature: temperature of the sin-cos position embedding.
        expansion: ``expansion`` argument of every ``CSPRepLayer``.
        depth_mult: each ``CSPRepLayer`` has ``round(3 * depth_mult)`` blocks.
        act: activation; resolved with ``get_act_fn`` when it is None, a str
            or a dict.
        trt: forwarded to ``get_act_fn``.
        eval_size: optional (height, width) for which position embeddings
            are precomputed and reused outside training.
    """
    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
    __inject__ = ['encoder_layer']

    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 hidden_dim=256,
                 use_encoder_idx=[2],
                 num_encoder_layers=1,
                 encoder_layer='TransformerLayer',
                 pe_temperature=10000,
                 expansion=1.0,
                 depth_mult=1.0,
                 act='silu',
                 trt=False,
                 eval_size=None):
        super().__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.eval_size = eval_size

        # Channel projection: 1x1 conv + batch norm per input level.
        self.input_proj = nn.LayerList()
        for in_channel in in_channels:
            conv = nn.Conv2D(
                in_channel, hidden_dim, kernel_size=1, bias_attr=False)
            norm = nn.BatchNorm2D(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
            self.input_proj.append(nn.Sequential(conv, norm))

        # One transformer encoder per selected level.
        encoders = []
        for _ in use_encoder_idx:
            encoders.append(
                TransformerEncoder(encoder_layer, num_encoder_layers))
        self.encoder = nn.LayerList(encoders)

        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        num_fusions = len(in_channels) - 1
        csp_blocks = round(3 * depth_mult)

        # Top-down FPN: a lateral 1x1 conv and a fusion block per step.
        self.lateral_convs = nn.LayerList()
        self.fpn_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.lateral_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    csp_blocks,
                    act=act,
                    expansion=expansion))

        # Bottom-up PAN: a stride-2 3x3 conv and a fusion block per step.
        self.downsample_convs = nn.LayerList()
        self.pan_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.downsample_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 3, stride=2, act=act))
            self.pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    csp_blocks,
                    act=act,
                    expansion=expansion))

        self._reset_parameters()

    def _reset_parameters(self):
        """Precompute position embeddings when ``eval_size`` is set.

        For each encoded level the embedding is stored on the instance as
        ``pos_embed{idx}``.
        """
        if not self.eval_size:
            return
        for idx in self.use_encoder_idx:
            stride = self.feat_strides[idx]
            # eval_size is indexed as (height, width).
            width = self.eval_size[1] // stride
            height = self.eval_size[0] // stride
            pos_embed = self.build_2d_sincos_position_embedding(
                width, height, self.hidden_dim, self.pe_temperature)
            setattr(self, f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(w,
                                           h,
                                           embed_dim=256,
                                           temperature=10000.):
        """Build a fixed 2D sin-cos position embedding.

        Args:
            w: grid width.
            h: grid height.
            embed_dim: embedding width; must be divisible by 4.
            temperature: base of the frequency schedule.

        Returns:
            Tensor of shape ``[1, w * h, embed_dim]`` holding, along the last
            axis, sin(w), cos(w), sin(h), cos(h) parts of equal width.
        """
        grid_w = paddle.arange(int(w), dtype=paddle.float32)
        grid_h = paddle.arange(int(h), dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**omega)

        # Outer product of flattened grid coordinates and frequencies.
        freq_row = omega.unsqueeze(0)
        out_w = grid_w.flatten().unsqueeze(-1) @ freq_row
        out_h = grid_h.flatten().unsqueeze(-1) @ freq_row

        parts = [
            paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
            paddle.cos(out_h)
        ]
        return paddle.concat(parts, axis=1).unsqueeze(0)

    def forward(self, feats, for_mot=False):
        """Encode and fuse the multi-level features.

        Args:
            feats: list of feature maps, one per entry of ``in_channels``.
            for_mot: unused in this method.

        Returns:
            list: fused feature maps, one per input level.
        """
        assert len(feats) == len(self.in_channels)
        num_levels = len(self.in_channels)

        # Project every level to hidden_dim channels.
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]

        # Transformer encoder on the selected levels.
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                level_feat = proj_feats[enc_ind]
                h, w = level_feat.shape[2:]
                # Flatten [B, C, H, W] to [B, HxW, C].
                src_flatten = level_feat.flatten(2).transpose([0, 2, 1])
                if self.training or self.eval_size is None:
                    pos_embed = self.build_2d_sincos_position_embedding(
                        w, h, self.hidden_dim, self.pe_temperature)
                else:
                    # Reuse the embedding precomputed for eval_size.
                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
                    [-1, self.hidden_dim, h, w])

        # Top-down FPN: walk from the last level towards the first.
        inner_outs = [proj_feats[-1]]
        for step, idx in enumerate(range(num_levels - 1, 0, -1)):
            # step equals num_levels - 1 - idx.
            feat_high = self.lateral_convs[step](inner_outs[0])
            inner_outs[0] = feat_high

            upsample_feat = F.interpolate(
                feat_high, scale_factor=2., mode="nearest")
            fused = paddle.concat(
                [upsample_feat, proj_feats[idx - 1]], axis=1)
            inner_outs.insert(0, self.fpn_blocks[step](fused))

        # Bottom-up PAN: walk from the first level towards the last.
        outs = [inner_outs[0]]
        for idx in range(num_levels - 1):
            downsample_feat = self.downsample_convs[idx](outs[-1])
            fused = paddle.concat(
                [downsample_feat, inner_outs[idx + 1]], axis=1)
            outs.append(self.pan_blocks[idx](fused))

        return outs

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``feat_strides`` from ``input_shape``."""
        channels = [spec.channels for spec in input_shape]
        strides = [spec.stride for spec in input_shape]
        return {'in_channels': channels, 'feat_strides': strides}

    @property
    def out_shape(self):
        """One ShapeSpec per level: ``hidden_dim`` channels, input stride."""
        specs = []
        for idx in range(len(self.in_channels)):
            specs.append(
                ShapeSpec(
                    channels=self.hidden_dim, stride=self.feat_strides[idx]))
        return specs
|
||||
184
rtdetr_paddle/ppdet/modeling/transformers/matchers.py
Normal file
184
rtdetr_paddle/ppdet/modeling/transformers/matchers.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..losses.iou_loss import GIoULoss
|
||||
from .utils import bbox_cxcywh_to_xyxy
|
||||
|
||||
__all__ = ['HungarianMatcher']
|
||||
|
||||
|
||||
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Assign predictions to ground-truth targets by minimum total cost.

    The cost combines a classification term, an L1 box term and a GIoU term,
    plus mask and dice terms when ``with_mask`` is enabled. The assignment
    is solved per image with ``scipy.optimize.linear_sum_assignment``.
    """
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Use the sigmoid focal-style class cost
                instead of the softmax probability cost.
            with_mask (bool): Add mask and dice costs to the cost matrix.
            num_sample_points (int): Number of random points sampled from
                each mask for the mask and dice costs.
            alpha (float): Focal cost weighting factor.
            gamma (float): Focal cost exponent.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)

        Raises:
            AssertionError: if ``with_mask`` is enabled but ``masks`` or
                ``gt_mask`` is None.
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        if sum(num_gts) == 0:
            # No targets in the whole batch: return empty index pairs.
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost: keep only the probability of each
        # target's class for every prediction.
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        # Compute the mask cost and dice cost
        if self.with_mask:
            # The condition and message must not be wrapped in parentheses:
            # asserting a non-empty tuple is always true and checks nothing.
            assert masks is not None and gt_mask is not None, \
                'Make sure the input has `mask` and `gt_mask`'
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            # Map the points from [0, 1) to [-1, 1) before grid_sample.
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            # Repeat each image's points once per target in that image.
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        # Split the batched cost matrix per image, then per image keep only
        # the columns of that image's own targets before solving.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
|
||||
100
rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
Normal file
100
rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
|
||||
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2D position embedding, either fixed sinusoidal or learned.

    Args:
        num_pos_feats: embedding width per spatial axis.
        temperature: base of the sine frequency schedule.
        normalize: scale cumulative positions to ``[0, scale]`` (sine only).
        scale: multiplier applied when ``normalize`` is set.
        embed_type: ``'sine'`` or ``'learned'``.
        num_embeddings: table size of the learned row/column embeddings.
        offset: value added to positions before normalization.
        eps: added to the normalization denominator.
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=2 * math.pi,
                 embed_type='sine',
                 num_embeddings=50,
                 offset=0.,
                 eps=1e-6):
        super().__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        self.offset = offset
        self.eps = eps
        if embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            self.scale = scale
        elif embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"{self.embed_type} is not supported.")

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W]
        Returns:
            pos (Tensor): [B, H, W, C]; the learned variant is built with a
                leading axis of size 1.
        """
        if self.embed_type == 'sine':
            # Cumulative sums of the mask give each position's coordinate.
            y_embed = mask.cumsum(1)
            x_embed = mask.cumsum(2)
            if self.normalize:
                y_last = y_embed[:, -1:, :]
                x_last = x_embed[:, :, -1:]
                y_embed = (y_embed + self.offset) / (
                    y_last + self.eps) * self.scale
                x_embed = (x_embed + self.offset) / (
                    x_last + self.eps) * self.scale

            # Consecutive channel pairs share one frequency.
            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
                         2).astype('float32')
            dim_t = self.temperature**(dim_t / self.num_pos_feats)

            def encode(embed):
                # sin on even channels, cos on odd channels, interleaved.
                pos = embed.unsqueeze(-1) / dim_t
                return paddle.stack(
                    (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()),
                    axis=4).flatten(3)

            pos_x = encode(x_embed)
            pos_y = encode(y_embed)
            return paddle.concat((pos_y, pos_x), axis=3)

        if self.embed_type == 'learned':
            h, w = mask.shape[-2:]
            col_index = paddle.arange(w)
            row_index = paddle.arange(h)
            x_emb = self.col_embed(col_index)
            y_emb = self.row_embed(row_index)
            # Broadcast column embeddings over rows and row embeddings over
            # columns, then join them on the channel axis.
            x_part = x_emb.unsqueeze(0).tile([h, 1, 1])
            y_part = y_emb.unsqueeze(1).tile([1, w, 1])
            return paddle.concat([x_part, y_part], axis=-1).unsqueeze(0)

        raise ValueError(f"not supported {self.embed_type}")
|
||||
523
rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
Normal file
523
rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
Normal file
@@ -0,0 +1,523 @@
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .deformable_transformer import MSDeformableAttention
|
||||
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
|
||||
bias_init_with_prob)
|
||||
from .utils import (_get_clones, get_sine_pos_embed,
|
||||
get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
|
||||
|
||||
# Names exported by a star-import of this module.
__all__ = ['RTDETRTransformer']
|
||||
|
||||
|
||||
class PPMSDeformableAttention(MSDeformableAttention):
    """MSDeformableAttention with an overridden ``forward``.

    The override accepts list-typed ``value_spatial_shapes`` and
    ``value_level_start_index`` and converts them to tensors before calling
    ``self.ms_deformable_attn_core``.
    """

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: if the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """
        batch, num_query = query.shape[:2]
        num_value = value.shape[1]
        heads = self.num_heads
        levels = self.num_levels
        points = self.num_points

        # Project the values, zero the padded positions, split into heads.
        value = self.value_proj(value)
        if value_mask is not None:
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([batch, num_value, heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [batch, num_query, heads, levels, points, 2])
        # Softmax runs jointly over all (level, point) pairs of a head.
        attention_weights = self.attention_weights(query).reshape(
            [batch, num_query, heads, levels * points])
        attention_weights = F.softmax(attention_weights).reshape(
            [batch, num_query, heads, levels, points])

        ref_dim = reference_points.shape[-1]
        if ref_dim == 2:
            # Flip (H, W) to (W, H) so offsets are normalised as (x, y).
            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
            offset_normalizer = offset_normalizer.flip([1]).reshape(
                [1, 1, 1, levels, 1, 2])
            anchor_xy = reference_points.reshape(
                [batch, num_query, 1, levels, 1, 2])
            sampling_locations = anchor_xy + sampling_offsets / offset_normalizer
        elif ref_dim == 4:
            # Offsets are scaled by half of the reference box size.
            anchor_xy = reference_points[:, :, None, :, None, :2]
            anchor_wh = reference_points[:, :, None, :, None, 2:]
            sampling_locations = (
                anchor_xy + sampling_offsets / points * anchor_wh * 0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        if isinstance(query, paddle.Tensor):
            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
            value_level_start_index = paddle.to_tensor(value_level_start_index)
            output = self.ms_deformable_attn_core(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)
        else:
            # query is not a paddle.Tensor: use the Python implementation.
            from ppdet.modeling.transformers.utils import deformable_attention_core_func
            output = deformable_attention_core_func(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)

        return self.output_proj(output)
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention, deformable cross-attention and an FFN,
    each followed by a residual connection and LayerNorm."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            d_model (int): channel width of the queries.
            n_head (int): number of attention heads.
            dim_feedforward (int): hidden width of the FFN.
            dropout (float): rate shared by all dropout layers.
            activation (str): name of a function in ``paddle.nn.functional``.
            n_levels (int): number of feature levels for cross attention.
            n_points (int): sampling points per level for cross attention.
            weight_attr, bias_attr: forwarded to the two FFN linear layers.
        """
        super().__init__()

        def _layer_norm():
            # LayerNorm whose weight and bias use an L2Decay(0.0) regularizer.
            return nn.LayerNorm(
                d_model,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = _layer_norm()

        # cross attention
        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
                                                  n_points, 1.0)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = _layer_norm()

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
                                 bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
                                 bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = _layer_norm()
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then re-initialise their weights
        with Xavier-uniform."""
        ffn_layers = (self.linear1, self.linear2)
        for linear in ffn_layers:
            linear_init_(linear)
        for linear in ffn_layers:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """linear1 -> activation -> dropout3 -> linear2."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """
        Run one decoder layer.

        Args:
            tgt (Tensor): query features.
            reference_points (Tensor): reference points for cross attention.
            memory (Tensor): flattened encoder features.
            memory_spatial_shapes (List): per-level (H, W).
            memory_level_start_index (List): per-level start offsets.
            attn_mask (Tensor, optional): cast to bool; True entries get an
                additive bias of 0 and False entries get -inf.
            memory_mask (Tensor, optional): passed to the cross attention as
                its value mask.
            query_pos_embed (Tensor, optional): added to the queries before
                both attentions.

        Returns:
            Tensor: the updated queries.
        """
        # self attention
        query_with_pos = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            allowed = paddle.zeros(attn_mask.shape, tgt.dtype)
            blocked = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, allowed, blocked)
        attended = self.self_attn(
            query_with_pos, query_with_pos, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(attended))

        # cross attention
        attended = self.cross_attn(
            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
            memory_spatial_shapes, memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(attended))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Layer):
    """Stack of decoder layers that refines the reference boxes layer by layer."""

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Args:
            hidden_dim (int): stored on the instance.
            decoder_layer (nn.Layer): prototype layer, cloned ``num_layers`` times.
            num_layers (int): number of decoder layers.
            eval_idx (int): layer whose output is returned outside training;
                a negative value counts from the end.
        """
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        if eval_idx < 0:
            eval_idx = num_layers + eval_idx
        self.eval_idx = eval_idx

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                score_head,
                query_pos_head,
                attn_mask=None,
                memory_mask=None):
        """
        Decode the queries and collect per-layer predictions.

        Args:
            tgt (Tensor): initial query features.
            ref_points_unact (Tensor): initial reference boxes before sigmoid.
            memory, memory_spatial_shapes, memory_level_start_index:
                forwarded to every decoder layer.
            bbox_head, score_head: indexable collections with one head per layer.
            query_pos_head: callable mapping reference boxes to a query
                position embedding.
            attn_mask, memory_mask: forwarded to every decoder layer.

        Returns:
            tuple: (stacked boxes, stacked logits). In training one entry is
            collected per layer; otherwise only layer ``self.eval_idx``.
        """
        output = tgt
        boxes_per_layer = []
        logits_per_layer = []
        ref_points_detach = F.sigmoid(ref_points_unact)
        for idx, layer in enumerate(self.layers):
            ref_points_input = ref_points_detach.unsqueeze(2)
            query_pos_embed = query_pos_head(ref_points_detach)

            output = layer(output, ref_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Refine relative to the (detached in training) previous boxes.
            inter_ref_bbox = F.sigmoid(bbox_head[idx](output) + inverse_sigmoid(
                ref_points_detach))

            if self.training:
                logits_per_layer.append(score_head[idx](output))
                if idx == 0:
                    boxes_per_layer.append(inter_ref_bbox)
                else:
                    # After the first layer the training output is computed
                    # against the non-detached previous boxes.
                    boxes_per_layer.append(
                        F.sigmoid(bbox_head[idx](output) + inverse_sigmoid(
                            ref_points)))
            elif idx == self.eval_idx:
                logits_per_layer.append(score_head[idx](output))
                boxes_per_layer.append(inter_ref_bbox)
                break

            ref_points = inter_ref_bbox
            if self.training:
                ref_points_detach = inter_ref_bbox.detach()
            else:
                ref_points_detach = inter_ref_bbox

        return paddle.stack(boxes_per_layer), paddle.stack(logits_per_layer)
|
||||
|
||||
|
||||
@register
class RTDETRTransformer(nn.Layer):
    """RT-DETR decoder-side transformer.

    Projects backbone features to ``hidden_dim``, selects the top
    ``num_queries`` encoder proposals, optionally prepends contrastive
    denoising queries during training, and runs the decoder.
    """
    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=300,
                 position_embed_type='sine',
                 backbone_feat_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 num_levels=3,
                 num_decoder_points=4,
                 nhead=8,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eval_size=None,
                 eval_idx=-1,
                 eps=1e-2):
        """
        Args:
            num_classes (int): number of object classes.
            hidden_dim (int): channel width used throughout the module.
            num_queries (int): number of proposals kept from the encoder.
            position_embed_type (str): must be 'sine' or 'learned'.
            backbone_feat_channels (list[int]): channels of each input level.
            feat_strides (list[int]): stride of each input level; extended by
                doubling the last stride until it has ``num_levels`` entries.
                The caller's list is not modified.
            num_levels (int): number of feature levels fed to the decoder.
            num_decoder_points (int): sampling points per level.
            nhead (int): number of attention heads.
            num_decoder_layers (int): number of decoder layers.
            dim_feedforward (int): FFN hidden width in each decoder layer.
            dropout (float): dropout rate in each decoder layer.
            activation (str): FFN activation name.
            num_denoising (int): denoising query budget.
            label_noise_ratio (float): label noise for denoising queries.
            box_noise_scale (float): box noise for denoising queries.
            learnt_init_query (bool): use a learnt embedding as the initial
                decoder queries instead of gathered encoder features.
            eval_size (sequence, optional): (H, W) used to pre-compute
                anchors for evaluation.
            eval_idx (int): decoder layer used for evaluation output.
            eps (float): margin used when marking anchors as valid.
        """
        super(RTDETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(backbone_feat_channels) <= num_levels
        assert len(feat_strides) == len(backbone_feat_channels)
        # Extend a private copy. Appending to the argument itself would
        # mutate the shared default list (or the caller's list), making the
        # length assertion above fail on the next construction.
        feat_strides = list(feat_strides)
        for _ in range(num_levels - len(feat_strides)):
            feat_strides.append(feat_strides[-1] * 2)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.feat_strides = feat_strides
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers
        self.eval_size = eval_size

        # backbone feature projection
        self._build_input_proj_layer(backbone_feat_channels)

        # Transformer module
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points)
        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
                                          num_decoder_layers, eval_idx)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head: one classifier and one box regressor per layer
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the heads and projections; cache eval anchors if
        ``eval_size`` is set."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for proj in self.input_proj:
            # index 0 is the conv of each (conv, norm) projection
            xavier_uniform_(proj[0].weight)

        # init encoder output anchors and valid_mask
        if self.eval_size:
            self.anchors, self.valid_mask = self._generate_anchors()

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``backbone_feat_channels`` from the input shapes."""
        return {'backbone_feat_channels': [i.channels for i in input_shape]}

    def _build_input_proj_layer(self, backbone_feat_channels):
        """Create one (conv, norm) projection per feature level.

        Levels that come from the backbone use a 1x1 conv. Extra levels, up
        to ``self.num_levels``, use a 3x3 stride-2 conv.
        """
        self.input_proj = nn.LayerList()
        for in_channels in backbone_feat_channels:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
        in_channels = backbone_feat_channels[-1]
        for _ in range(self.num_levels - len(backbone_feat_channels)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
            # every further extra level consumes the previous projection
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats):
        """Project and flatten the multi-level features.

        Returns:
            tuple: (features [b, sum(h*w), c], list of [h, w] per level,
            list of start indices per level).
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                if i == len_srcs:
                    # first extra level comes from the last raw feature
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        spatial_shapes = []
        level_start_index = [0, ]
        for feat in proj_feats:
            _, _, h, w = feat.shape
            # [b, c, h, w] -> [b, h*w, c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            # [num_levels, 2]
            spatial_shapes.append([h, w])
            # [l], start index of each level
            level_start_index.append(h * w + level_start_index[-1])

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # drop the trailing total so one start index per level remains
        level_start_index.pop()
        return (feat_flatten, spatial_shapes, level_start_index)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """
        Args:
            feats (list[Tensor]): multi-level features, each [b, c, h, w].
            pad_mask: accepted but not used by this method.
            gt_meta (dict): ground truth passed to the denoising group
                builder; only read in training mode.

        Returns:
            tuple: (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
            dn_meta).
        """
        # input projection and embedding
        (memory, spatial_shapes,
         level_start_index) = self._get_encoder_input(feats)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, denoising_class, denoising_bbox_unact)

        # decoder
        out_bboxes, out_logits = self.decoder(
            target,
            init_ref_points_unact,
            memory,
            spatial_shapes,
            level_start_index,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask)
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _generate_anchors(self,
                          spatial_shapes=None,
                          grid_size=0.05,
                          dtype="float32"):
        """Generate one anchor per feature location, in logit space.

        Args:
            spatial_shapes (list, optional): per-level [h, w]; derived from
                ``self.eval_size`` and ``self.feat_strides`` when None.
            grid_size (float): anchor width/height at level 0, doubled at
                each following level.
            dtype (str): dtype of the generated grids.

        Returns:
            tuple: (anchors [1, sum(h*w), 4] with invalid rows set to inf,
            valid_mask [1, sum(h*w), 1]).
        """
        if spatial_shapes is None:
            spatial_shapes = [
                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
                for s in self.feat_strides
            ]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(
                    end=h, dtype=dtype),
                paddle.arange(
                    end=w, dtype=dtype))
            grid_xy = paddle.stack([grid_x, grid_y], -1)

            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
            # cell centres normalised by the level's width and height
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

        anchors = paddle.concat(anchors, 1)
        # valid only if all four values lie strictly inside (eps, 1 - eps)
        valid_mask = ((anchors > self.eps) *
                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
        # inverse sigmoid
        anchors = paddle.log(anchors / (1 - anchors))
        anchors = paddle.where(valid_mask, anchors,
                               paddle.to_tensor(float("inf")))
        return anchors, valid_mask

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top-k encoder proposals and build the decoder inputs.

        Returns:
            tuple: (target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits). Denoising queries, when given, are
            concatenated in front of the selected queries.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        if self.training or self.eval_size is None:
            anchors, valid_mask = self._generate_anchors(spatial_shapes)
        else:
            anchors, valid_mask = self.anchors, self.valid_mask
        # zero out features at invalid anchor positions
        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)

        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

        # rank positions by their best class score
        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        if self.training:
            reference_points_unact = reference_points_unact.detach()
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind)
            if self.training:
                target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
|
||||
481
rtdetr_paddle/ppdet/modeling/transformers/utils.py
Normal file
481
rtdetr_paddle/ppdet/modeling/transformers/utils.py
Normal file
@@ -0,0 +1,481 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
# Modified from detrex (https://github.com/IDEA-Research/detrex)
|
||||
# Copyright 2022 The IDEA Authors. All rights reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
# Names exported by a star-import of this module.
__all__ = [
    '_get_clones', 'bbox_cxcywh_to_xyxy',
    'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',
    'deformable_attention_core_func', 'varifocal_loss_with_logits'
]
|
||||
|
||||
|
||||
|
||||
def bbox_area(boxes):
    """Return per-row areas: (col 2 - col 0) * (col 3 - col 1)."""
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights
|
||||
|
||||
|
||||
def bbox_overlaps(boxes1, boxes2):
    """
    Compute pairwise IoU between two sets of boxes.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N];
            all zeros (float32) when either set is empty.
    """
    num_a = boxes1.shape[0]
    num_b = boxes2.shape[0]
    if num_a * num_b == 0:
        return paddle.zeros([num_a, num_b], dtype='float32')
    area_a = bbox_area(boxes1)
    area_b = bbox_area(boxes2)

    # Broadcast boxes1 as [M, 1, 4] against boxes2 as [N, 4].
    lower_right = paddle.minimum(
        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
    upper_left = paddle.maximum(
        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
    # Non-overlapping pairs get a zero-sized intersection.
    extent = (lower_right - upper_left).clip(min=0)
    inter = extent.prod(axis=2)

    union = paddle.unsqueeze(area_a, 1) + area_b - inter
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
|
||||
|
||||
|
||||
def _get_clones(module, N):
    """Return an nn.LayerList holding N independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.LayerList(copies)
|
||||
|
||||
|
||||
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes whose last axis is (cx, cy, w, h) to (x1, y1, x2, y2)."""
    center, size = paddle.split(x, 2, axis=-1)
    half_size = 0.5 * size
    top_left = center - half_size
    bottom_right = center + half_size
    return paddle.concat([top_left, bottom_right], axis=-1)
|
||||
|
||||
|
||||
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes whose last axis is (x1, y1, x2, y2) to (cx, cy, w, h)."""
    left, top, right, bottom = x.split(4, axis=-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.concat([center_x, center_y, width, height], axis=-1)
|
||||
|
||||
|
||||
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Sigmoid focal loss.

    Args:
        logit (Tensor): raw scores.
        label (Tensor): targets, same shape as ``logit``.
        normalizer (float): divisor applied to the summed loss.
        alpha (float): positive/negative balance weight; a negative value
            disables the weighting.
        gamma (float): focusing exponent.

    Returns:
        Tensor: the loss averaged over axis 1, summed, divided by ``normalizer``.
    """
    prob = F.sigmoid(logit)
    ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    # probability assigned to the true class of each element
    p_t = prob * label + (1 - prob) * (1 - label)
    modulating = (1 - p_t)**gamma
    loss = ce_loss * modulating

    if alpha >= 0:
        alpha_t = alpha * label + (1 - alpha) * (1 - label)
        loss = alpha_t * loss
    reduced = loss.mean(1).sum()
    return reduced / normalizer
|
||||
|
||||
|
||||
def inverse_sigmoid(x, eps=1e-5):
    """Return log(x / (1 - x)) with ``x`` clipped to [0, 1].

    Numerator and denominator are each clipped to at least ``eps`` so the
    result stays finite at 0 and 1.
    """
    x = x.clip(min=0., max=1.)
    numerator = x.clip(min=eps)
    denominator = (1 - x).clip(min=eps)
    return paddle.log(numerator / denominator)
|
||||
|
||||
|
||||
def deformable_attention_core_func(value, value_spatial_shapes,
                                   value_level_start_index, sampling_locations,
                                   attention_weights):
    """
    Multi-scale deformable attention implemented with ``F.grid_sample``.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        value_level_start_index (Tensor|List): [n_levels]; not used here, the
            levels are split by their h * w sizes instead
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C]
    """
    bs, _, n_head, c = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, axis=1)
    # grid_sample expects coordinates in [-1, 1]
    grids = 2 * sampling_locations - 1
    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        level_value = per_level_values[level].flatten(2)
        level_value = level_value.transpose([0, 2, 1])
        level_value = level_value.reshape([bs * n_head, c, h, w])
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        level_grid = grids[:, :, :, level].transpose([0, 2, 1, 3, 4])
        level_grid = level_grid.flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampled.append(
            F.grid_sample(
                level_value,
                level_grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))
    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [bs * n_head, 1, Len_q, n_levels * n_points])
    stacked = paddle.stack(sampled, axis=-2).flatten(-2)
    # weighted sum over all (level, point) samples
    output = (stacked * weights).sum(-1).reshape([bs, n_head * c, Len_q])

    return output.transpose([0, 2, 1])
|
||||
|
||||
|
||||
def get_valid_ratio(mask):
    """Return the valid (width, height) fraction of each mask in the batch.

    The height ratio is the sum of the first column divided by H. The width
    ratio is the sum of the first row divided by W.

    Args:
        mask (Tensor): three-dimensional, unpacked as [b, H, W].

    Returns:
        Tensor: [b, 2] ordered as (ratio_w, ratio_h).
    """
    _, height, width = paddle.shape(mask)
    first_column = mask[:, :, 0]
    first_row = mask[:, 0, :]
    ratio_h = paddle.sum(first_column, 1) / height
    ratio_w = paddle.sum(first_row, 1) / width
    # [b, 2]
    return paddle.stack([ratio_w, ratio_h], -1)
|
||||
|
||||
|
||||
def get_denoising_training_group(targets,
                                 num_classes,
                                 num_queries,
                                 class_embed,
                                 num_denoising=100,
                                 label_noise_ratio=0.5,
                                 box_noise_scale=1.0):
    """Build denoising queries from noised ground truth.

    Args:
        targets (dict): must provide "gt_class" and "gt_bbox", one entry per
            image.
        num_classes (int): number of classes; also used as the padding label.
        num_queries (int): number of matching queries that follow the
            denoising queries.
        class_embed (Tensor): class embedding table; a zero row is appended
            for the padding label.
        num_denoising (int): denoising query budget; <= 0 disables the group.
        label_noise_ratio (float): each real GT label is replaced by a random
            class with probability ``label_noise_ratio * 0.5``.
        box_noise_scale (float): scale of the random box perturbation.

    Returns:
        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
        four ``None`` values when denoising is disabled or the batch has no
        ground truth. ``attn_mask`` is boolean with True where attention is
        allowed.
    """
    if num_denoising <= 0:
        return None, None, None, None
    num_gts = [len(t) for t in targets["gt_class"]]
    max_gt_num = max(num_gts)
    if max_gt_num == 0:
        return None, None, None, None

    # at least one group, even when a single image exceeds the budget
    num_group = max(1, num_denoising // max_gt_num)
    # pad gt to max_num of a batch
    bs = len(targets["gt_class"])
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i, num_gt in enumerate(num_gts):
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1

    input_query_class = input_query_class.tile([1, num_group])
    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, num_group])

    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        pad_gt_mask = pad_gt_mask.flatten()
        # half of bbox prob
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        # Combine the bool noise mask with the float padding mask through an
        # explicit bool cast, so no mixed-dtype arithmetic is needed. This
        # matches get_contrastive_denoising_training_group.
        noisy = paddle.logical_and(mask, pad_gt_mask.astype('bool'))
        chosen_idx = paddle.nonzero(noisy).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])
        pad_gt_mask.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        # centre noise is bounded by half the box size, size noise by the
        # full box size
        diff = paddle.concat(
            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],
            axis=-1) * box_noise_scale
        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
        input_query_bbox += diff
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # extra zero row serves the padding label (index == num_classes)
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # start all False; True marks "blocked" until the final inversion
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other
    for i in range(num_group):
        start = max_gt_num * i
        stop = max_gt_num * (i + 1)
        if stop < num_denoising:
            # hide every later group
            attn_mask[start:stop, stop:num_denoising] = True
        if start > 0:
            # hide every earlier group
            attn_mask[start:stop, :start] = True
    attn_mask = ~attn_mask
    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
|
||||
|
||||
|
||||
def get_contrastive_denoising_training_group(targets,
                                             num_classes,
                                             num_queries,
                                             class_embed,
                                             num_denoising=100,
                                             label_noise_ratio=0.5,
                                             box_noise_scale=1.0):
    """Build contrastive denoising queries from noised ground truth.

    Each group holds a positive copy of the padded ground truth followed by
    a negative copy. Negative boxes receive a larger random perturbation.

    Args:
        targets (dict): must provide "gt_class" and "gt_bbox", one entry per
            image.
        num_classes (int): number of classes; also used as the padding label.
        num_queries (int): number of matching queries that follow the
            denoising queries.
        class_embed (Tensor): class embedding table; a zero row is appended
            for the padding label.
        num_denoising (int): denoising query budget; <= 0 disables the group.
        label_noise_ratio (float): each real GT label is replaced by a random
            class with probability ``label_noise_ratio * 0.5``.
        box_noise_scale (float): scale of the random box perturbation.

    Returns:
        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
        four ``None`` values when denoising is disabled or the batch has no
        ground truth. ``attn_mask`` is boolean with True where attention is
        allowed.
    """
    if num_denoising <= 0:
        return None, None, None, None
    gt_classes = targets["gt_class"]
    num_gts = [len(t) for t in gt_classes]
    max_gt_num = max(num_gts)
    if max_gt_num == 0:
        return None, None, None, None

    # at least one group, even when a single image exceeds the budget
    num_group = max(1, num_denoising // max_gt_num)
    # pad gt to max_num of a batch
    bs = len(gt_classes)
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for img_idx, num_gt in enumerate(num_gts):
        if num_gt > 0:
            input_query_class[img_idx, :num_gt] = gt_classes[img_idx].squeeze(-1)
            input_query_bbox[img_idx, :num_gt] = targets["gt_bbox"][img_idx]
            pad_gt_mask[img_idx, :num_gt] = 1
    # each group has positive and negative queries.
    copies = 2 * num_group
    input_query_class = input_query_class.tile([1, copies])
    input_query_bbox = input_query_bbox.tile([1, copies, 1])
    pad_gt_mask = pad_gt_mask.tile([1, copies])
    # positive and negative mask: second half of every group is negative
    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])
    negative_gt_mask[:, max_gt_num:] = 1
    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
    positive_gt_mask = 1 - negative_gt_mask
    # contrastive denoising training positive index
    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * 2 * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        # bool padding mask so it can be combined with the bool noise mask
        pad_gt_mask = pad_gt_mask.flatten().astype('bool')

        # half of bbox prob
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)

        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)

        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])
        pad_gt_mask.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)

        # per-corner noise bound: half of the box width / height
        half_wh = input_query_bbox[..., 2:] * 0.5
        diff = paddle.tile(half_wh, [1, 1, 2]) * box_noise_scale

        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
        rand_part = paddle.rand(input_query_bbox.shape)
        # negatives are shifted by (rand + 1), positives by rand
        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
            1 - negative_gt_mask)
        rand_part *= rand_sign
        known_bbox += rand_part * diff
        known_bbox.clip_(min=0.0, max=1.0)
        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # extra zero row serves the padding label (index == num_classes)
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # start all False; True marks "blocked" until the final inversion
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other
    group_len = max_gt_num * 2
    for group in range(num_group):
        start = group_len * group
        stop = group_len * (group + 1)
        if stop < num_denoising:
            # hide every later group
            attn_mask[start:stop, stop:num_denoising] = True
        if start > 0:
            # hide every earlier group
            attn_mask[start:stop, :start] = True
    attn_mask = ~attn_mask
    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
|
||||
|
||||
|
||||
def get_sine_pos_embed(pos_tensor,
                       num_pos_feats=128,
                       temperature=10000,
                       exchange_xy=True):
    """Build a sinusoidal embedding for every coordinate of a position tensor.

    Args:
        pos_tensor (Tensor): Positions to embed. It is split along its last
            axis into `n` single-coordinate slices, and each slice is indexed
            as a 3-D tensor (presumably `(batch, num_queries, n)` -- confirm
            against callers).
        num_pos_feats (int): Number of embedding features produced for each
            coordinate. Default: 128.
        temperature (int): Base of the frequency schedule used to scale
            the position embedding. Default: 10000.
        exchange_xy (bool, optional): Swap the embeddings of the first two
            coordinates. For example, for an input `[x, y]` the result is
            `[pos(y), pos(x)]`. Default: True.

    Returns:
        Tensor: The per-coordinate embeddings concatenated along axis 2,
            i.e. `n * num_pos_feats` features on the last axis.
    """
    two_pi = 2. * math.pi
    feat_index = paddle.arange(num_pos_feats)
    # Consecutive (sin, cos) feature pairs share one exponent: 0, 0, 2, 2, ...
    exponent = 2. * paddle.floor_divide(feat_index, paddle.to_tensor(2))
    dim_t = two_pi / temperature**(exponent / num_pos_feats)

    def _embed_coordinate(coord):
        # Broadcast the single coordinate against every frequency.
        scaled = coord * dim_t
        sin_part = scaled[:, :, 0::2].sin()
        cos_part = scaled[:, :, 1::2].cos()
        # Interleave sin/cos on a new trailing axis, then merge it back.
        return paddle.stack((sin_part, cos_part), axis=3).flatten(2)

    num_coords = pos_tensor.shape[-1]
    embeddings = [
        _embed_coordinate(coord) for coord in pos_tensor.split(num_coords, -1)
    ]
    if exchange_xy:
        embeddings[0], embeddings[1] = embeddings[1], embeddings[0]
    return paddle.concat(embeddings, axis=2)
|
||||
|
||||
|
||||
def mask_to_box_coordinate(mask,
                           normalize=False,
                           format="xyxy",
                           dtype="float32"):
    """
    Compute the bounding boxes around the provided mask.

    Args:
        mask (Tensor:bool): [b, c, h, w]
        normalize (bool): If True, x coordinates are divided by `w` and
            y coordinates by `h`. Default: False.
        format (str): "xyxy" returns the corner boxes as computed; "xywh"
            returns them converted with `bbox_xyxy_to_cxcywh`.
            Default: "xyxy".
        dtype (str): dtype of the coordinate grid and of the all-zero
            result returned when every mask is empty. Default: "float32".

    Returns:
        bbox (Tensor): [b, c, 4]. A mask without any foreground pixel
            yields an all-zero box.
    """
    assert mask.ndim == 4
    assert format in ["xyxy", "xywh"]
    # Fast path: nothing is set in any mask.
    if mask.sum() == 0:
        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)

    h, w = mask.shape[-2:]
    y, x = paddle.meshgrid(
        paddle.arange(
            end=h, dtype=dtype), paddle.arange(
                end=w, dtype=dtype))

    # Max over masked coordinates gives the far edge (+1 makes it exclusive).
    # For the near edge, background pixels are pushed to a large sentinel so
    # they never win the min.
    x_mask = x * mask
    x_max = x_mask.flatten(-2).max(-1) + 1
    x_min = paddle.where(mask, x_mask,
                         paddle.to_tensor(1e8)).flatten(-2).min(-1)

    y_mask = y * mask
    y_max = y_mask.flatten(-2).max(-1) + 1
    y_min = paddle.where(mask, y_mask,
                         paddle.to_tensor(1e8)).flatten(-2).min(-1)
    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)

    # An individual empty mask would otherwise produce the bogus box
    # [1e8, 1e8, 1, 1] (sentinel min, 0 + 1 max); zero those boxes so the
    # result matches what the all-empty fast path returns.
    has_foreground = mask.any(axis=[2, 3]).unsqueeze(-1)
    out_bbox = out_bbox * has_foreground.astype(out_bbox.dtype)

    if normalize:
        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)

    return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox)
|
||||
|
||||
|
||||
def varifocal_loss_with_logits(pred_logits,
                               gt_score,
                               label,
                               normalizer=1.0,
                               alpha=0.75,
                               gamma=2.0):
    """Weighted binary cross-entropy between logits and soft target scores.

    Each element is weighted by
    `alpha * sigmoid(pred_logits)**gamma * (1 - label) + gt_score * label`.

    Args:
        pred_logits (Tensor): Raw (pre-sigmoid) predictions.
        gt_score (Tensor): Target scores; used both as the cross-entropy
            target and as the weight of entries selected by `label`.
        label (Tensor): Selector blended into the weight (presumably a 0/1
            mask of positive entries -- confirm against callers).
        normalizer (float): Divisor applied to the summed loss. Default: 1.0.
        alpha (float): Scale of the `(1 - label)` weight term. Default: 0.75.
        gamma (float): Exponent applied to the predicted score in the
            `(1 - label)` weight term. Default: 2.0.

    Returns:
        Tensor: Element-wise loss averaged over axis 1, summed, and divided
            by `normalizer`.
    """
    prob = F.sigmoid(pred_logits)
    negative_weight = alpha * prob.pow(gamma) * (1 - label)
    positive_weight = gt_score * label
    elementwise = F.binary_cross_entropy_with_logits(
        pred_logits,
        gt_score,
        weight=negative_weight + positive_weight,
        reduction='none')
    reduced = elementwise.mean(1)
    return reduced.sum() / normalizer
|
||||
|
||||
|
||||
|
||||
|
||||
from ..initializer import linear_init_
|
||||
|
||||
class MLP(nn.Layer):
    """Stack of `nn.Linear` layers with ReLU after every layer but the last.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Create the linear layers and initialize their parameters.

        Args:
            input_dim (int): Input feature size of the first layer.
            hidden_dim (int): Feature size between consecutive layers.
            output_dim (int): Output feature size of the last layer.
            num_layers (int): Number of linear layers in the stack.
        """
        super().__init__()
        self.num_layers = num_layers
        hidden_dims = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]
        self.layers = nn.LayerList([
            nn.Linear(fan_in, fan_out)
            for fan_in, fan_out in zip(in_dims, out_dims)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Apply `linear_init_` to every linear layer of the stack."""
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        """Run `x` through the stack; the last layer's output is not activated."""
        last_index = self.num_layers - 1
        for index, layer in enumerate(self.layers):
            x = layer(x)
            if index < last_index:
                x = F.relu(x)
        return x
|
||||
|
||||
Reference in New Issue
Block a user