first commit
This commit is contained in:
20
rtdetr_paddle/ppdet/modeling/transformers/__init__.py
Normal file
20
rtdetr_paddle/ppdet/modeling/transformers/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from .utils import *
|
||||
from .matchers import *
|
||||
from .position_encoding import *
|
||||
from .rtdetr_transformer import *
|
||||
from .dino_transformer import *
|
||||
from .hybrid_encoder import *
|
||||
@@ -0,0 +1,537 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
|
||||
# Copyright (c) 2020 SenseTime. All Rights Reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .utils import _get_clones, get_valid_ratio
|
||||
from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
|
||||
|
||||
__all__ = ['DeformableTransformer']
|
||||
|
||||
|
||||
class MSDeformableAttention(nn.Layer):
    """Multi-Scale Deformable Attention Module.

    For every query, a linear layer predicts ``num_points`` sampling offsets
    per head and per feature level, and a second linear layer predicts the
    attention weight of each sampled point. The projected values are sampled
    at those locations by ``ms_deformable_attn_core`` and the result is passed
    through an output projection.

    Args:
        embed_dim (int): Channel size of queries/values. Must be divisible by
            ``num_heads``.
        num_heads (int): Number of attention heads.
        num_levels (int): Number of feature levels sampled from.
        num_points (int): Sampling points per head and per level.
        lr_mult (float): Learning-rate multiplier applied to the weight and
            bias of the ``sampling_offsets`` projection.
    """

    def __init__(self,
                 embed_dim=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 lr_mult=0.1):
        super(MSDeformableAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        self.total_points = num_heads * num_levels * num_points

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        # Two values (x, y) are predicted for every sampling point.
        self.sampling_offsets = nn.Linear(
            embed_dim,
            self.total_points * 2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.attention_weights = nn.Linear(embed_dim, self.total_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)
        try:
            # use cuda op
            from deformable_detr_ops import ms_deformable_attn
        except Exception:
            # Any failure to load the optional compiled op falls back to the
            # pure-paddle implementation. `Exception` (rather than a bare
            # `except`) keeps KeyboardInterrupt / SystemExit propagating.
            from .utils import deformable_attention_core_func as ms_deformable_attn
        self.ms_deformable_attn_core = ms_deformable_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the projections of this module."""
        # sampling_offsets
        constant_(self.sampling_offsets.weight)
        # One angle per head, evenly spaced over the full circle.
        thetas = paddle.arange(
            self.num_heads,
            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
        # Rescale each direction so that its largest absolute component is 1.
        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)
        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
            [1, self.num_levels, self.num_points, 1])
        # The k-th sampling point (1-based) is placed k steps along the
        # head's direction.
        scaling = paddle.arange(
            1, self.num_points + 1,
            dtype=paddle.float32).reshape([1, 1, -1, 1])
        grid_init *= scaling
        self.sampling_offsets.bias.set_value(grid_init.flatten())
        # attention_weights
        constant_(self.attention_weights.weight)
        constant_(self.attention_weights.bias)
        # proj
        xavier_uniform_(self.value_proj.weight)
        constant_(self.value_proj.bias)
        xavier_uniform_(self.output_proj.weight)
        constant_(self.output_proj.bias)

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: If the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        # The flattened value sequence must cover every level exactly.
        assert int(value_spatial_shapes.prod(1).sum()) == Len_v

        value = self.value_proj(value)
        if value_mask is not None:
            # Multiply projected values by the mask (0 at padding positions).
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
        # Softmax is taken jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])

        if reference_points.shape[-1] == 2:
            # (H, W) is flipped to (W, H) so offsets are divided by the
            # matching (x, y) extent of each level.
            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2])
            sampling_locations = reference_points.reshape([
                bs, Len_q, 1, self.num_levels, 1, 2
            ]) + sampling_offsets / offset_normalizer
        elif reference_points.shape[-1] == 4:
            # The first two components are the reference location, the last
            # two scale the offsets.
            sampling_locations = (
                reference_points[:, :, None, :, None, :2] + sampling_offsets /
                self.num_points * reference_points[:, :, None, :, None, 2:] *
                0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        output = self.ms_deformable_attn_core(
            value, value_spatial_shapes, value_level_start_index,
            sampling_locations, attention_weights)
        output = self.output_proj(output)

        return output
|
||||
|
||||
|
||||
class DeformableTransformerEncoderLayer(nn.Layer):
    """Encoder layer: deformable self-attention followed by a feed-forward net.

    Each sub-block adds its (dropped-out) result to its input and then applies
    a LayerNorm.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super(DeformableTransformerEncoderLayer, self).__init__()

        # --- deformable self-attention sub-block ---
        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                               n_points, lr_mult)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # `activation` names a function in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then re-draw their weights with Xavier."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)
        for ffn_linear in (self.linear1, self.linear2):
            xavier_uniform_(ffn_linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply the feed-forward sub-block (residual + LayerNorm) to ``src``."""
        hidden = self.activation(self.linear1(src))
        ffn_out = self.linear2(self.dropout2(hidden))
        return self.norm2(src + self.dropout3(ffn_out))

    def forward(self,
                src,
                reference_points,
                spatial_shapes,
                level_start_index,
                src_mask=None,
                query_pos_embed=None):
        """Run self-attention and the FFN on ``src`` and return the result.

        The positional embedding is added to the attention query only; the
        attention value is ``src`` itself.
        """
        attn_query = self.with_pos_embed(src, query_pos_embed)
        attn_out = self.self_attn(attn_query, reference_points, src,
                                  spatial_shapes, level_start_index, src_mask)
        src = self.norm1(src + self.dropout1(attn_out))

        return self.forward_ffn(src)
|
||||
|
||||
|
||||
class DeformableTransformerEncoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``encoder_layer``."""

    def __init__(self, encoder_layer, num_layers):
        super(DeformableTransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
        """Build the reference points used by the encoder layers.

        For each level a grid of (x, y) positions shifted by ``offset`` is
        created, divided by the level extent scaled by that level's valid
        ratio, and the concatenation over levels is finally multiplied by
        the valid ratios of every level.
        """
        valid_ratios = valid_ratios.unsqueeze(1)
        per_level = []
        for lvl, (H, W) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=H) + offset, paddle.arange(end=W) + offset)
            # Index 1 of the ratio pairs with the height, index 0 with the width.
            extent_y = valid_ratios[:, :, lvl, 1] * H
            extent_x = valid_ratios[:, :, lvl, 0] * W
            grid_y = grid_y.flatten().unsqueeze(0) / extent_y
            grid_x = grid_x.flatten().unsqueeze(0) / extent_x
            per_level.append(paddle.stack((grid_x, grid_y), axis=-1))
        all_points = paddle.concat(per_level, 1).unsqueeze(2)
        return all_points * valid_ratios

    def forward(self,
                feat,
                spatial_shapes,
                level_start_index,
                feat_mask=None,
                query_pos_embed=None,
                valid_ratios=None):
        """Pass ``feat`` through every encoder layer in order.

        When ``valid_ratios`` is omitted, a tensor of ones shaped
        [feat.shape[0], number of levels, 2] is used.
        """
        if valid_ratios is None:
            batch_size = feat.shape[0]
            level_count = spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch_size, level_count, 2])

        ref_points = self.get_reference_points(spatial_shapes, valid_ratios)

        encoded = feat
        for enc_layer in self.layers:
            encoded = enc_layer(encoded, ref_points, spatial_shapes,
                                level_start_index, feat_mask, query_pos_embed)
        return encoded
|
||||
|
||||
|
||||
class DeformableTransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention, deformable cross-attention, then an FFN.

    Each of the three sub-blocks adds its (dropped-out) result to its input
    and then applies a LayerNorm.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        super(DeformableTransformerDecoderLayer, self).__init__()

        # --- query self-attention sub-block ---
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- deformable cross-attention sub-block ---
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # `activation` names a function in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then re-draw their weights with Xavier."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)
        for ffn_linear in (self.linear1, self.linear2):
            xavier_uniform_(ffn_linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Apply the feed-forward sub-block (residual + LayerNorm) to ``tgt``."""
        hidden = self.activation(self.linear1(tgt))
        ffn_out = self.linear2(self.dropout3(hidden))
        return self.norm3(tgt + self.dropout4(ffn_out))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        """Run the three sub-blocks on ``tgt`` and return the updated queries."""
        # Self-attention: query and key carry the positional embedding, the
        # value is the raw target.
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        self_out = self.self_attn(query, key, value=tgt)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # Cross-attention over the encoder memory at the reference points.
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        return self.forward_ffn(tgt)
|
||||
|
||||
|
||||
class DeformableTransformerDecoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``decoder_layer``.

    With ``return_intermediate`` the output of every layer is returned
    (stacked along a new leading axis); otherwise only the final output is
    returned, with a leading axis of size one added.
    """

    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        super(DeformableTransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        """Feed ``tgt`` through all decoder layers in sequence."""
        hidden = tgt
        per_layer_outputs = []
        for dec_layer in self.layers:
            hidden = dec_layer(hidden, reference_points, memory,
                               memory_spatial_shapes,
                               memory_level_start_index, memory_mask,
                               query_pos_embed)
            if self.return_intermediate:
                per_layer_outputs.append(hidden)

        if not self.return_intermediate:
            return hidden.unsqueeze(0)
        return paddle.stack(per_layer_outputs)
|
||||
|
||||
|
||||
@register
class DeformableTransformer(nn.Layer):
    """Deformable-DETR style transformer (multi-scale encoder + decoder).

    Backbone feature maps are projected to ``hidden_dim`` channels, extra
    levels are created with stride-2 convolutions until there are
    ``num_feature_levels`` levels, all levels are flattened and encoded, and
    ``num_queries`` learned queries are decoded against the encoder memory.

    Args:
        num_queries (int): Number of learned object queries.
        position_embed_type (str): ``'sine'`` or ``'learned'``.
        return_intermediate_dec (bool): Forwarded to the decoder as
            ``return_intermediate``.
        in_feats_channel (list[int]): Channels of each input feature map.
        num_feature_levels (int): Total number of feature levels; must be at
            least ``len(in_feats_channel)``.
        num_encoder_points (int): Sampling points per level in the encoder.
        num_decoder_points (int): Sampling points per level in the decoder.
        hidden_dim (int): Model width.
        nhead (int): Number of attention heads.
        num_encoder_layers (int): Number of encoder layers.
        num_decoder_layers (int): Number of decoder layers.
        dim_feedforward (int): Hidden width of the feed-forward nets.
        dropout (float): Dropout probability.
        activation (str): Name of a function in ``paddle.nn.functional``.
        lr_mult (float): Learning-rate multiplier for the sampling-offset
            projections (encoder and decoder) and the reference-point
            projection.
        pe_temperature (int): ``temperature`` of the position embedding.
        pe_offset (float): ``offset`` of the position embedding.
    """
    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=300,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 in_feats_channel=[512, 1024, 2048],
                 num_feature_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 lr_mult=0.1,
                 pe_temperature=10000,
                 pe_offset=-0.5):
        super(DeformableTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_encoder_points, lr_mult)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)

        # lr_mult is forwarded here as well so that the decoder's sampling
        # offsets honour the configured multiplier instead of silently using
        # the decoder layer's own default.
        decoder_layer = DeformableTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_decoder_points, lr_mult)
        self.decoder = DeformableTransformerDecoder(
            decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        # Maps each query's positional embedding to a 2-d reference point.
        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        # 1x1 projections for the levels supplied by the backbone ...
        self.input_proj = nn.LayerList()
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim)))
        # ... and stride-2 3x3 projections for each additional level.
        in_channels = in_feats_channel[-1]
        for _ in range(num_feature_levels - len(in_feats_channel)):
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels,
                        hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1),
                    nn.GroupNorm(32, hidden_dim)))
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=position_embed_type == 'sine',
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise embeddings, the reference-point head and input convs."""
        normal_(self.level_embed.weight)
        normal_(self.tgt_embed.weight)
        normal_(self.query_pos_embed.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for proj in self.input_proj:
            # proj[0] is the Conv2D of each (Conv2D, GroupNorm) pair.
            xavier_uniform_(proj[0].weight)
            constant_(proj[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_feats_channel`` from the channels of ``input_shape``."""
        return {'in_feats_channel': [i.channels for i in input_shape], }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        """Encode the feature maps and decode the learned queries.

        Args:
            src_feats (list[Tensor]): Backbone feature maps; each is indexed
                as [bs, c, h, w] below.
            src_mask (Tensor, optional): Mask that is resized to every
                level. When None, an all-ones mask is used for the position
                embedding and no mask is passed to the encoder/decoder.

        Returns:
            tuple: ``(hs, memory, reference_points)`` - decoder output,
            encoder memory, and the sigmoid-activated reference points
            (before scaling by the valid ratios).
        """
        srcs = [self.input_proj[i](feat) for i, feat in enumerate(src_feats)]
        if self.num_feature_levels > len(srcs):
            len_srcs = len(srcs)
            for i in range(len_srcs, self.num_feature_levels):
                # The first extra level is built from the last raw backbone
                # feature; later ones from the previous projected level.
                extra_input = src_feats[-1] if i == len_srcs else srcs[-1]
                srcs.append(self.input_proj[i](extra_input))

        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            src_shape = paddle.shape(src)
            # Slices (rather than scalar indexing) keep these as 1-element
            # tensors so they can be concatenated below.
            bs = src_shape[0:1]
            h = src_shape[2:3]
            w = src_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))
            # [bs, c, h, w] -> [bs, h*w, c]
            src = src.flatten(2).transpose([0, 2, 1])
            src_flatten.append(src)
            if src_mask is not None:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            valid_ratios.append(get_valid_ratio(mask))
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            # A per-level embedding is added to the positional embedding.
            lvl_pos_embed = pos_embed + self.level_embed.weight[level]
            lvl_pos_embed_flatten.append(lvl_pos_embed)
            mask = mask.flatten(1)
            mask_flatten.append(mask)
        src_flatten = paddle.concat(src_flatten, 1)
        mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,
                                                                   1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], start index of each level in the flattened sequence
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
        ])
        # [b, l, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)

        # prepare input for decoder
        bs, _, c = memory.shape
        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        reference_points = F.sigmoid(self.reference_points(query_embed))
        # Broadcast the per-query points over levels and scale by the valid
        # ratio of each level.
        reference_points_input = reference_points.unsqueeze(
            2) * valid_ratios.unsqueeze(1)

        # decoder
        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
                          level_start_index, mask_flatten, query_embed)

        return (hs, memory, reference_points)
|
||||
359
rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
Normal file
359
rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
Normal file
@@ -0,0 +1,359 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention, _convert_attention_mask
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .utils import _get_clones
|
||||
from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
|
||||
|
||||
__all__ = ['DETRTransformer']
|
||||
|
||||
|
||||
class TransformerEncoderLayer(nn.Layer):
    """Transformer encoder layer: multi-head self-attention plus an FFN.

    ``normalize_before`` selects where LayerNorm sits: before each sub-block
    (True) or after each residual addition (False).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerEncoderLayer, self).__init__()
        # Unset attention / activation dropout rates fall back to `dropout`.
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        # `activation` names a function in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the two feed-forward linear layers."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` when it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply self-attention and the FFN to ``src``."""
        pre_norm = self.normalize_before

        # --- self-attention sub-block ---
        attn_in = self.norm1(src) if pre_norm else src
        q = k = self.with_pos_embed(attn_in, pos_embed)
        attn_out = self.self_attn(q, k, value=attn_in, attn_mask=src_mask)
        src = src + self.dropout1(attn_out)
        if not pre_norm:
            src = self.norm1(src)

        # --- feed-forward sub-block ---
        ffn_in = self.norm2(src) if pre_norm else src
        hidden = self.dropout(self.activation(self.linear1(ffn_in)))
        ffn_out = self.linear2(hidden)
        src = src + self.dropout2(ffn_out)
        if not pre_norm:
            src = self.norm2(src)
        return src
|
||||
|
||||
|
||||
class TransformerEncoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``encoder_layer`` with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        super(TransformerEncoder, self).__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        """Run ``src`` through every layer, then through ``norm`` if one was given."""
        encoded = src
        for enc_layer in self.layers:
            encoded = enc_layer(encoded, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is None:
            return encoded
        return self.norm(encoded)
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Layer):
    """Transformer decoder layer: self-attention, cross-attention, then an FFN.

    ``normalize_before`` selects where LayerNorm sits: before each sub-block
    (True) or after each residual addition (False).
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerDecoderLayer, self).__init__()
        # Unset attention / activation dropout rates fall back to `dropout`.
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Feed-forward network
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        # `activation` names a function in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the two feed-forward linear layers."""
        for ffn_linear in (self.linear1, self.linear2):
            linear_init_(ffn_linear)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` when it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Apply the three sub-blocks to ``tgt`` and return the result."""
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
        pre_norm = self.normalize_before

        # --- self-attention over the queries ---
        self_in = self.norm1(tgt) if pre_norm else tgt
        q = k = self.with_pos_embed(self_in, query_pos_embed)
        self_out = self.self_attn(q, k, value=self_in, attn_mask=tgt_mask)
        tgt = tgt + self.dropout1(self_out)
        if not pre_norm:
            tgt = self.norm1(tgt)

        # --- cross-attention: queries attend to the encoder memory ---
        cross_in = self.norm2(tgt) if pre_norm else tgt
        q = self.with_pos_embed(cross_in, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        cross_out = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = tgt + self.dropout2(cross_out)
        if not pre_norm:
            tgt = self.norm2(tgt)

        # --- feed-forward ---
        ffn_in = self.norm3(tgt) if pre_norm else tgt
        hidden = self.dropout(self.activation(self.linear1(ffn_in)))
        ffn_out = self.linear2(hidden)
        tgt = tgt + self.dropout3(ffn_out)
        if not pre_norm:
            tgt = self.norm3(tgt)
        return tgt
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Layer):
    """Stack of ``num_layers`` clones of ``decoder_layer``.

    Args:
        decoder_layer (nn.Layer): Layer that is cloned ``num_layers`` times.
        num_layers (int): Number of decoder layers.
        norm (nn.Layer, optional): Normalisation applied to the returned
            output(s). May be None.
        return_intermediate (bool): When True, the (normalised) output of
            every layer is returned, stacked along a new leading axis.
    """

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def _apply_norm(self, tensor):
        """Return ``self.norm(tensor)``, or ``tensor`` itself when no norm is set."""
        return tensor if self.norm is None else self.norm(tensor)

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Decode ``tgt`` against ``memory``.

        Returns:
            Tensor: The stacked per-layer outputs when ``return_intermediate``
            is True; otherwise the final output with a leading axis of size
            one added.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

        output = tgt
        intermediate = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # Guarded: `norm` defaults to None, and calling None here
                # used to raise TypeError.
                intermediate.append(self._apply_norm(output))

        if self.return_intermediate:
            return paddle.stack(intermediate)

        return self._apply_norm(output).unsqueeze(0)
|
||||
|
||||
|
||||
@register
|
||||
class DETRTransformer(nn.Layer):
|
||||
__shared__ = ['hidden_dim']
|
||||
|
||||
def __init__(self,
|
||||
num_queries=100,
|
||||
position_embed_type='sine',
|
||||
return_intermediate_dec=True,
|
||||
backbone_num_channels=2048,
|
||||
hidden_dim=256,
|
||||
nhead=8,
|
||||
num_encoder_layers=6,
|
||||
num_decoder_layers=6,
|
||||
dim_feedforward=2048,
|
||||
dropout=0.1,
|
||||
activation="relu",
|
||||
pe_temperature=10000,
|
||||
pe_offset=0.,
|
||||
attn_dropout=None,
|
||||
act_dropout=None,
|
||||
normalize_before=False):
|
||||
super(DETRTransformer, self).__init__()
|
||||
assert position_embed_type in ['sine', 'learned'],\
|
||||
f'ValueError: position_embed_type not supported {position_embed_type}!'
|
||||
self.hidden_dim = hidden_dim
|
||||
self.nhead = nhead
|
||||
|
||||
encoder_layer = TransformerEncoderLayer(
|
||||
hidden_dim, nhead, dim_feedforward, dropout, activation,
|
||||
attn_dropout, act_dropout, normalize_before)
|
||||
encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
|
||||
self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
|
||||
encoder_norm)
|
||||
|
||||
decoder_layer = TransformerDecoderLayer(
|
||||
hidden_dim, nhead, dim_feedforward, dropout, activation,
|
||||
attn_dropout, act_dropout, normalize_before)
|
||||
decoder_norm = nn.LayerNorm(hidden_dim)
|
||||
self.decoder = TransformerDecoder(
|
||||
decoder_layer,
|
||||
num_decoder_layers,
|
||||
decoder_norm,
|
||||
return_intermediate=return_intermediate_dec)
|
||||
|
||||
self.input_proj = nn.Conv2D(
|
||||
backbone_num_channels, hidden_dim, kernel_size=1)
|
||||
self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
|
||||
self.position_embedding = PositionEmbedding(
|
||||
hidden_dim // 2,
|
||||
temperature=pe_temperature,
|
||||
normalize=True if position_embed_type == 'sine' else False,
|
||||
embed_type=position_embed_type,
|
||||
offset=pe_offset)
|
||||
|
||||
self._reset_parameters()
|
||||
|
||||
def _reset_parameters(self):
|
||||
for p in self.parameters():
|
||||
if p.dim() > 1:
|
||||
xavier_uniform_(p)
|
||||
conv_init_(self.input_proj)
|
||||
normal_(self.query_pos_embed.weight)
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, cfg, input_shape):
|
||||
return {
|
||||
'backbone_num_channels': [i.channels for i in input_shape][-1],
|
||||
}
|
||||
|
||||
def _convert_attention_mask(self, mask):
|
||||
return (mask - 1.0) * 1e9
|
||||
|
||||
def forward(self, src, src_mask=None, *args, **kwargs):
    r"""
    Run the encoder and decoder on the last backbone feature map.

    Parameters:
        src (List(Tensor)): Backbone feature maps; only ``src[-1]`` is used.
        src_mask (Tensor, optional): Mask that is resized to the projected
            feature map with ``F.interpolate``. NOTE(review): judging by
            ``_convert_attention_mask`` it holds 1 for positions to keep
            and 0 for positions to ignore, presumably with shape
            [bs, H, W] -- confirm with callers. When None, an all-ones
            mask is used. Default None.

    Returns:
        tuple: ``(output, memory, src_proj, src_mask)`` where
            output is whatever ``self.decoder`` returns,
            memory is the encoder output reshaped to [bs, c, h, w],
            src_proj is the projected last-level feature map, and
            src_mask is the additive attention mask reshaped to
            [bs, 1, 1, h, w] in training mode and None otherwise.
    """
    # Only the last-level feature map is used.
    feat = self.input_proj(src[-1])
    bs, c, h, w = paddle.shape(feat)
    # [B, C, H, W] -> [B, H*W, C]
    tokens = feat.flatten(2).transpose([0, 2, 1])

    if src_mask is None:
        src_mask = paddle.ones([bs, h, w])
    else:
        src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
    pos_embed = self.position_embedding(src_mask).flatten(1, 2)

    # The attention mask is only built in training mode; otherwise the
    # encoder and decoder run without a mask.
    attn_mask = None
    if self.training:
        attn_mask = self._convert_attention_mask(src_mask)
        attn_mask = attn_mask.reshape([bs, 1, 1, h * w])

    memory = self.encoder(tokens, src_mask=attn_mask, pos_embed=pos_embed)

    queries = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
    tgt = paddle.zeros_like(queries)
    output = self.decoder(
        tgt,
        memory,
        memory_mask=attn_mask,
        pos_embed=pos_embed,
        query_pos_embed=queries)

    if self.training:
        out_mask = attn_mask.reshape([bs, 1, 1, h, w])
    else:
        out_mask = None

    memory_map = memory.transpose([0, 2, 1]).reshape([bs, c, h, w])
    return (output, memory_map, feat, out_mask)
|
||||
527
rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
Normal file
527
rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
Normal file
@@ -0,0 +1,527 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
|
||||
# Copyright (c) 2020 SenseTime. All Rights Reserved.
|
||||
# Modified from detrex (https://github.com/IDEA-Research/detrex)
|
||||
# Copyright 2022 The IDEA Authors. All rights reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .position_encoding import PositionEmbedding
|
||||
from .deformable_transformer import (MSDeformableAttention,
|
||||
DeformableTransformerEncoderLayer,
|
||||
DeformableTransformerEncoder)
|
||||
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
|
||||
bias_init_with_prob)
|
||||
from .utils import (_get_clones, get_valid_ratio,
|
||||
get_contrastive_denoising_training_group,
|
||||
get_sine_pos_embed, inverse_sigmoid, MLP)
|
||||
|
||||
__all__ = ['DINOTransformer']
|
||||
|
||||
|
||||
class DINOTransformerDecoderLayer(nn.Layer):
    """One DINO decoder layer.

    Applies, in order: multi-head self-attention, multi-scale deformable
    cross-attention and a two-layer feed-forward network. Each stage adds
    its (dropout-ed) result to the input and then applies a LayerNorm.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=1.0,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()

        def _layer_norm():
            # All three norms share the same width and parameter attributes.
            return nn.LayerNorm(
                d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # Sub-layers are created in the same order as before so that
        # parameter registration order is unchanged.

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = _layer_norm()

        # cross attention
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = _layer_norm()

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # ``activation`` names a function in paddle.nn.functional.
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = _layer_norm()
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both FFN linears, then Xavier-initialise their weights."""
        ffn_linears = (self.linear1, self.linear2)
        for linear in ffn_linears:
            linear_init_(linear)
        for linear in ffn_linears:
            xavier_uniform_(linear.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged when ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Feed-forward: linear1 -> activation -> dropout3 -> linear2."""
        hidden = self.linear1(tgt)
        hidden = self.activation(hidden)
        hidden = self.dropout3(hidden)
        return self.linear2(hidden)

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run self-attention, deformable cross-attention and the FFN on ``tgt``.

        ``attn_mask``, when given, is cast to bool; True entries become an
        additive 0 and False entries become -inf before self-attention.
        Returns the updated ``tgt``.
        """
        # self attention: queries and keys share the position-augmented input
        query = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            allowed = paddle.zeros(attn_mask.shape, tgt.dtype)
            blocked = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, allowed, blocked)
        self_out = self.self_attn(query, query, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
|
||||
|
||||
|
||||
class DINOTransformerDecoder(nn.Layer):
    """Stack of decoder layers with per-layer reference point refinement."""

    def __init__(self,
                 hidden_dim,
                 decoder_layer,
                 num_layers,
                 weight_attr=None,
                 bias_attr=None):
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # One LayerNorm, applied to the output of every layer.
        self.norm = nn.LayerNorm(
            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                query_pos_head,
                valid_ratios=None,
                attn_mask=None,
                memory_mask=None):
        """Run all decoder layers, refining the reference points after each.

        ``bbox_head`` is indexed by layer number and ``query_pos_head`` is
        called on the sine embedding of the current reference points.

        Returns:
            tuple: (stack of ``self.norm``-ed layer outputs,
                    stack of refined reference points), one entry per layer.
        """
        if valid_ratios is None:
            batch = memory.shape[0]
            levels = memory_spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch, levels, 2])

        hidden = tgt
        layer_outputs = []
        layer_bboxes = []
        ref_points = F.sigmoid(ref_points_unact)
        for i, layer in enumerate(self.layers):
            # Reference points are detached here, so this step passes no
            # gradient back into them.
            scaled_ratios = valid_ratios.tile([1, 1, 2]).unsqueeze(1)
            reference_points_input = ref_points.detach().unsqueeze(
                2) * scaled_ratios
            sine_embed = get_sine_pos_embed(reference_points_input[..., 0, :],
                                            self.hidden_dim // 2)
            query_pos_embed = query_pos_head(sine_embed)

            hidden = layer(hidden, reference_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Predicted delta is added in logit space to the previous points.
            delta = bbox_head[i](hidden)
            ref_points = F.sigmoid(delta + inverse_sigmoid(ref_points.detach(
            )))

            layer_outputs.append(self.norm(hidden))
            layer_bboxes.append(ref_points)

        return paddle.stack(layer_outputs), paddle.stack(layer_bboxes)
|
||||
|
||||
|
||||
@register
|
||||
class DINOTransformer(nn.Layer):
|
||||
__shared__ = ['num_classes', 'hidden_dim']
|
||||
|
||||
def __init__(self,
             num_classes=80,
             hidden_dim=256,
             num_queries=900,
             position_embed_type='sine',
             in_feats_channel=[512, 1024, 2048],
             num_levels=4,
             num_encoder_points=4,
             num_decoder_points=4,
             nhead=8,
             num_encoder_layers=6,
             num_decoder_layers=6,
             dim_feedforward=1024,
             dropout=0.,
             activation="relu",
             lr_mult=1.0,
             pe_temperature=10000,
             pe_offset=-0.5,
             num_denoising=100,
             label_noise_ratio=0.5,
             box_noise_scale=1.0,
             learnt_init_query=True,
             eps=1e-2):
    """Build the input projections, encoder, decoder and prediction heads.

    ``position_embed_type`` must be 'sine' or 'learned', and
    ``in_feats_channel`` may not list more entries than ``num_levels``.
    Sub-layers are created in a fixed order and ``_reset_parameters`` is
    called last.
    """
    super(DINOTransformer, self).__init__()
    assert position_embed_type in ('sine', 'learned'), \
        f'ValueError: position_embed_type not supported {position_embed_type}!'
    assert num_levels >= len(in_feats_channel)

    # Plain attributes first: the projection builder below reads
    # self.hidden_dim and self.num_levels.
    self.hidden_dim = hidden_dim
    self.nhead = nhead
    self.num_levels = num_levels
    self.num_classes = num_classes
    self.num_queries = num_queries
    self.eps = eps
    self.num_decoder_layers = num_decoder_layers

    # Parameter attributes with an L2Decay(0.0) regularizer, passed to the
    # norm layers (and forwarded to the encoder/decoder layers).
    weight_attr = ParamAttr(regularizer=L2Decay(0.0))
    bias_attr = ParamAttr(regularizer=L2Decay(0.0))

    # backbone feature projection
    self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)

    # Transformer module
    encoder_layer = DeformableTransformerEncoderLayer(
        hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
        num_encoder_points, lr_mult, weight_attr, bias_attr)
    self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                num_encoder_layers)
    decoder_layer = DINOTransformerDecoderLayer(
        hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
        num_decoder_points, lr_mult, weight_attr, bias_attr)
    self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,
                                          num_decoder_layers, weight_attr,
                                          bias_attr)

    # denoising part
    class_embed_attr = ParamAttr(initializer=nn.initializer.Normal())
    self.denoising_class_embed = nn.Embedding(
        num_classes, hidden_dim, weight_attr=class_embed_attr)
    self.num_denoising = num_denoising
    self.label_noise_ratio = label_noise_ratio
    self.box_noise_scale = box_noise_scale

    # position embedding; normalisation is enabled only for the sine type
    self.position_embedding = PositionEmbedding(
        hidden_dim // 2,
        temperature=pe_temperature,
        normalize=(position_embed_type == 'sine'),
        embed_type=position_embed_type,
        offset=pe_offset)
    self.level_embed = nn.Embedding(num_levels, hidden_dim)

    # decoder embedding
    self.learnt_init_query = learnt_init_query
    if learnt_init_query:
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
    self.query_pos_head = MLP(
        2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2)

    # encoder head
    enc_linear = nn.Linear(hidden_dim, hidden_dim)
    enc_norm = nn.LayerNorm(
        hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)
    self.enc_output = nn.Sequential(enc_linear, enc_norm)
    self.enc_score_head = nn.Linear(hidden_dim, num_classes)
    self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

    # decoder head: one classifier and one box regressor per decoder layer
    score_heads = [
        nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)
    ]
    self.dec_score_head = nn.LayerList(score_heads)
    bbox_heads = [
        MLP(hidden_dim, hidden_dim, 4, num_layers=3)
        for _ in range(num_decoder_layers)
    ]
    self.dec_bbox_head = nn.LayerList(bbox_heads)

    self._reset_parameters()
|
||||
|
||||
def _reset_parameters(self):
    """Initialise the heads, embeddings and input projections.

    Score heads get ``linear_init_`` plus a constant bias from
    ``bias_init_with_prob(0.01)``; the last layer of every bbox head is
    reset with ``constant_`` (weight and bias). The call order matches the
    original implementation.
    """
    bias_cls = bias_init_with_prob(0.01)

    def _init_head_pair(score_head, bbox_head):
        # Same four steps for the encoder pair and every decoder pair.
        linear_init_(score_head)
        constant_(score_head.bias, bias_cls)
        constant_(bbox_head.layers[-1].weight)
        constant_(bbox_head.layers[-1].bias)

    # class and bbox head init: encoder heads first, then decoder heads
    _init_head_pair(self.enc_score_head, self.enc_bbox_head)
    for score_head, bbox_head in zip(self.dec_score_head,
                                     self.dec_bbox_head):
        _init_head_pair(score_head, bbox_head)

    enc_linear = self.enc_output[0]
    linear_init_(enc_linear)
    xavier_uniform_(enc_linear.weight)
    normal_(self.level_embed.weight)
    if self.learnt_init_query:
        # tgt_embed only exists when learnt_init_query is enabled.
        xavier_uniform_(self.tgt_embed.weight)
    for layer_idx in (0, 1):
        xavier_uniform_(self.query_pos_head.layers[layer_idx].weight)
    for proj in self.input_proj:
        # Index 0 of every projection is its conv layer.
        conv = proj[0]
        xavier_uniform_(conv.weight)
        constant_(conv.bias)
|
||||
|
||||
@classmethod
def from_config(cls, cfg, input_shape):
    """Collect the ``channels`` of every ``input_shape`` entry as ``in_feats_channel``."""
    channels = [shape.channels for shape in input_shape]
    return {'in_feats_channel': channels}
|
||||
|
||||
def _build_input_proj_layer(self,
                            in_feats_channel,
                            weight_attr=None,
                            bias_attr=None):
    """Create ``self.input_proj``: one conv + GroupNorm block per level.

    Each entry of ``in_feats_channel`` gets a 1x1 conv. If ``num_levels``
    exceeds the number of entries, the missing levels get a 3x3 stride-2
    conv: the first takes ``in_feats_channel[-1]`` input channels, later
    ones take ``hidden_dim``.
    """

    def _with_group_norm(conv):
        # The conv is created by the caller first, then the norm, which
        # keeps the original parameter creation order.
        norm = nn.GroupNorm(
            32,
            self.hidden_dim,
            weight_attr=weight_attr,
            bias_attr=bias_attr)
        return nn.Sequential(('conv', conv), ('norm', norm))

    self.input_proj = nn.LayerList()
    for in_channels in in_feats_channel:
        pointwise = nn.Conv2D(in_channels, self.hidden_dim, kernel_size=1)
        self.input_proj.append(_with_group_norm(pointwise))

    num_extra_levels = self.num_levels - len(in_feats_channel)
    in_channels = in_feats_channel[-1]
    for _ in range(num_extra_levels):
        downsample = nn.Conv2D(
            in_channels,
            self.hidden_dim,
            kernel_size=3,
            stride=2,
            padding=1)
        self.input_proj.append(_with_group_norm(downsample))
        # Every further extra level takes hidden_dim input channels.
        in_channels = self.hidden_dim
|
||||
|
||||
def _get_encoder_input(self, feats, pad_mask=None):
    """Project the feature maps and flatten them into encoder inputs.

    Returns:
        tuple: (feat_flatten, spatial_shapes, level_start_index,
                mask_flatten, lvl_pos_embed_flatten, valid_ratios).
                ``mask_flatten`` is None when ``pad_mask`` is None.
    """
    # get projection features
    proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
    num_given = len(proj_feats)
    # Extra levels: the first one is computed from the last raw feature
    # map, each later one from the previously projected level.
    for lvl in range(num_given, self.num_levels):
        source = feats[-1] if lvl == num_given else proj_feats[-1]
        proj_feats.append(self.input_proj[lvl](source))

    # get encoder inputs
    has_mask = pad_mask is not None
    flat_feats = []
    flat_masks = []
    flat_pos_embeds = []
    level_shapes = []
    level_ratios = []
    for lvl, feat in enumerate(proj_feats):
        bs, _, h, w = paddle.shape(feat)
        level_shapes.append(paddle.stack([h, w]))
        # [b,c,h,w] -> [b,h*w,c]
        flat_feats.append(feat.flatten(2).transpose([0, 2, 1]))
        if has_mask:
            mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
            # [b, h*w]
            flat_masks.append(mask.flatten(1))
        else:
            mask = paddle.ones([bs, h, w])
        level_ratios.append(get_valid_ratio(mask))
        # [b, h*w, c]
        pos_embed = self.position_embedding(mask).flatten(1, 2)
        flat_pos_embeds.append(pos_embed + self.level_embed.weight[lvl])

    # [b, l, c]
    feat_flatten = paddle.concat(flat_feats, 1)
    # [b, l]
    mask_flatten = paddle.concat(flat_masks, 1) if has_mask else None
    # [b, l, c]
    lvl_pos_embed_flatten = paddle.concat(flat_pos_embeds, 1)
    # [num_levels, 2]
    stacked_shapes = paddle.stack(level_shapes).astype('int64')
    spatial_shapes = paddle.to_tensor(stacked_shapes)
    # [l] start index of each level
    level_sizes = spatial_shapes.prod(1)
    level_start_index = paddle.concat(
        [paddle.zeros(
            [1], dtype='int64'), level_sizes.cumsum(0)[:-1]])
    # [b, num_levels, 2]
    valid_ratios = paddle.stack(level_ratios, 1)
    return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
            lvl_pos_embed_flatten, valid_ratios)
|
||||
|
||||
def forward(self, feats, pad_mask=None, gt_meta=None):
    """Encode the features, pick the initial queries and decode them.

    In training mode a contrastive denoising group is built from
    ``gt_meta``; otherwise all denoising inputs are None.

    Returns:
        tuple: (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta); the first two are stacked over decoder layers.
    """
    # input projection and embedding
    encoder_inputs = self._get_encoder_input(feats, pad_mask)
    (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
     lvl_pos_embed_flatten, valid_ratios) = encoder_inputs

    # encoder
    memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
                          mask_flatten, lvl_pos_embed_flatten, valid_ratios)

    # prepare denoising training
    denoising_class = denoising_bbox_unact = attn_mask = dn_meta = None
    if self.training:
        denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
            get_contrastive_denoising_training_group(
                gt_meta, self.num_classes, self.num_queries,
                self.denoising_class_embed.weight, self.num_denoising,
                self.label_noise_ratio, self.box_noise_scale)

    target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
        self._get_decoder_input(
            memory, spatial_shapes, mask_flatten, denoising_class,
            denoising_bbox_unact)

    # decoder
    inter_feats, inter_bboxes = self.decoder(
        target, init_ref_points_unact, memory, spatial_shapes,
        level_start_index, self.dec_bbox_head, self.query_pos_head,
        valid_ratios, attn_mask, mask_flatten)

    layer_bboxes = []
    layer_logits = []
    for i in range(self.num_decoder_layers):
        layer_feat = inter_feats[i]
        layer_logits.append(self.dec_score_head[i](layer_feat))
        delta = self.dec_bbox_head[i](layer_feat)
        # Layer 0 refines the initial reference points; every later layer
        # refines the boxes produced by the layer before it.
        if i == 0:
            base_unact = init_ref_points_unact
        else:
            base_unact = inverse_sigmoid(inter_bboxes[i - 1])
        layer_bboxes.append(F.sigmoid(delta + base_unact))
    out_bboxes = paddle.stack(layer_bboxes)
    out_logits = paddle.stack(layer_logits)

    return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
            dn_meta)
|
||||
|
||||
def _get_encoder_output_anchors(self,
                                memory,
                                spatial_shapes,
                                memory_mask=None,
                                grid_size=0.05):
    """Build per-position anchors and the projected encoder memory.

    For every level a grid of cell centres is generated and divided by the
    valid width/height (taken from ``memory_mask`` when given, else the
    level's own w/h). Anchor sizes are ``grid_size * 2**lvl``. Anchors
    outside (eps, 1 - eps), or masked out by ``memory_mask``, are set to
    +inf after the logit transform and their memory entries are zeroed.

    Returns:
        tuple: (``self.enc_output`` applied to the masked memory,
                anchors in logit space).
    """
    per_level_anchors = []
    offset = 0
    for lvl, (h, w) in enumerate(spatial_shapes):
        num_cells = h * w
        if memory_mask is None:
            valid_H, valid_W = h, w
        else:
            level_mask = memory_mask[:, offset:offset + num_cells].reshape(
                [-1, h, w])
            # Sum the first column / first row of the mask.
            valid_H = paddle.sum(level_mask[:, :, 0], 1)
            valid_W = paddle.sum(level_mask[:, 0, :], 1)

        grid_y, grid_x = paddle.meshgrid(
            paddle.arange(end=h), paddle.arange(end=w))
        centers = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)

        valid_WH = paddle.stack([valid_W, valid_H], -1)
        valid_WH = valid_WH.reshape([-1, 1, 1, 2]).astype(centers.dtype)
        centers = (centers.unsqueeze(0) + 0.5) / valid_WH
        sizes = paddle.ones_like(centers) * grid_size * (2.0**lvl)
        level_anchors = paddle.concat([centers, sizes], -1)
        per_level_anchors.append(level_anchors.reshape([-1, num_cells, 4]))
        offset += num_cells

    output_anchors = paddle.concat(per_level_anchors, 1)
    above_eps = output_anchors > self.eps
    below_one = output_anchors < 1 - self.eps
    valid_mask = (above_eps * below_one).all(-1, keepdim=True)
    # logit transform
    output_anchors = paddle.log(output_anchors / (1 - output_anchors))
    if memory_mask is not None:
        mask_is_set = memory_mask.unsqueeze(-1) > 0
        valid_mask = (valid_mask * mask_is_set) > 0
    output_anchors = paddle.where(valid_mask, output_anchors,
                                  paddle.to_tensor(float("inf")))

    memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
    output_memory = self.enc_output(memory)
    return output_memory, output_anchors
|
||||
|
||||
def _get_decoder_input(self,
                       memory,
                       spatial_shapes,
                       memory_mask=None,
                       denoising_class=None,
                       denoising_bbox_unact=None):
    """Select the top-``num_queries`` encoder proposals as decoder input.

    Proposals are ranked by their maximum class logit. Denoising queries
    and boxes, when given, are concatenated in front of the selected ones.

    Returns:
        tuple: (target, detached un-sigmoided reference points,
                sigmoided top-k boxes, top-k class logits).
    """
    bs, _num_tokens, _dim = memory.shape
    # prepare input for decoder
    output_memory, output_anchors = self._get_encoder_output_anchors(
        memory, spatial_shapes, memory_mask)
    enc_outputs_class = self.enc_score_head(output_memory)
    enc_outputs_coord_unact = self.enc_bbox_head(
        output_memory) + output_anchors

    best_class_score = enc_outputs_class.max(-1)
    topk_ind = paddle.topk(best_class_score, self.num_queries, axis=1)[1]
    # Pair every selected index with its batch index so gather_nd can be
    # used on [batch, position, ...] tensors.
    batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
    batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
    topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

    # extract region proposal boxes (still un-sigmoided)
    reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                              topk_ind)
    # Sigmoid is taken before the denoising boxes are concatenated, so
    # enc_topk_bboxes holds only the selected proposals.
    enc_topk_bboxes = F.sigmoid(reference_points_unact)
    if denoising_bbox_unact is not None:
        reference_points_unact = paddle.concat(
            [denoising_bbox_unact, reference_points_unact], 1)
    enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

    # extract region features
    if self.learnt_init_query:
        target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
    else:
        target = paddle.gather_nd(output_memory, topk_ind).detach()
    if denoising_class is not None:
        target = paddle.concat([denoising_class, target], 1)

    detached_ref_points = reference_points_unact.detach()
    return target, detached_ref_points, enc_topk_bboxes, enc_topk_logits
|
||||
85
rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
Normal file
85
rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
Normal file
@@ -0,0 +1,85 @@
|
||||
# Multi-scale deformable attention自定义OP编译
|
||||
该自定义OP的实现参考了飞桨官方文档[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)。
|
||||
|
||||
## 1. 环境依赖
|
||||
- Paddle >= 2.3.2
|
||||
- gcc 8.2
|
||||
|
||||
## 2. 安装
|
||||
请在当前路径下进行编译安装
|
||||
```
|
||||
cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
|
||||
python setup_ms_deformable_attn_op.py install
|
||||
```
|
||||
|
||||
编译完成后即可使用。以下为`ms_deformable_attn`的使用示例(运行前需先执行`import paddle`):
|
||||
```
|
||||
# 引入自定义op
|
||||
from deformable_detr_ops import ms_deformable_attn
|
||||
|
||||
# 构造fake input tensor
|
||||
bs, n_heads, c = 2, 8, 8
|
||||
query_length, n_levels, n_points = 2, 2, 2
|
||||
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
|
||||
level_start_index = paddle.concat((paddle.to_tensor(
|
||||
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
|
||||
value_length = sum([(H * W).item() for H, W in spatial_shapes])
|
||||
|
||||
def get_test_tensors(channels):
|
||||
value = paddle.rand(
|
||||
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
|
||||
sampling_locations = paddle.rand(
|
||||
[bs, query_length, n_heads, n_levels, n_points, 2],
|
||||
dtype=paddle.float32)
|
||||
attention_weights = paddle.rand(
|
||||
[bs, query_length, n_heads, n_levels, n_points],
|
||||
dtype=paddle.float32) + 1e-5
|
||||
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
|
||||
-2, keepdim=True)
|
||||
return [value, sampling_locations, attention_weights]
|
||||
|
||||
value, sampling_locations, attention_weights = get_test_tensors(c)
|
||||
|
||||
output = ms_deformable_attn(value,
|
||||
spatial_shapes,
|
||||
level_start_index,
|
||||
sampling_locations,
|
||||
attention_weights)
|
||||
```
|
||||
|
||||
## 3. 单元测试
|
||||
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示:
|
||||
```
|
||||
python test_ms_deformable_attn_op.py
|
||||
```
|
||||
运行成功后,打印如下:
|
||||
```
|
||||
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
|
||||
*tensor1 True check_gradient_numerical(D=30)
|
||||
*tensor2 True check_gradient_numerical(D=30)
|
||||
*tensor3 True check_gradient_numerical(D=30)
|
||||
*tensor1 True check_gradient_numerical(D=32)
|
||||
*tensor2 True check_gradient_numerical(D=32)
|
||||
*tensor3 True check_gradient_numerical(D=32)
|
||||
*tensor1 True check_gradient_numerical(D=64)
|
||||
*tensor2 True check_gradient_numerical(D=64)
|
||||
*tensor3 True check_gradient_numerical(D=64)
|
||||
*tensor1 True check_gradient_numerical(D=71)
|
||||
*tensor2 True check_gradient_numerical(D=71)
|
||||
*tensor3 True check_gradient_numerical(D=71)
|
||||
*tensor1 True check_gradient_numerical(D=128)
|
||||
*tensor2 True check_gradient_numerical(D=128)
|
||||
*tensor3 True check_gradient_numerical(D=128)
|
||||
*tensor1 True check_gradient_numerical(D=1024)
|
||||
*tensor2 True check_gradient_numerical(D=1024)
|
||||
*tensor3 True check_gradient_numerical(D=1024)
|
||||
*tensor1 True check_gradient_numerical(D=1025)
|
||||
*tensor2 True check_gradient_numerical(D=1025)
|
||||
*tensor3 True check_gradient_numerical(D=1025)
|
||||
*tensor1 True check_gradient_numerical(D=2048)
|
||||
*tensor2 True check_gradient_numerical(D=2048)
|
||||
*tensor3 True check_gradient_numerical(D=2048)
|
||||
*tensor1 True check_gradient_numerical(D=3096)
|
||||
*tensor2 True check_gradient_numerical(D=3096)
|
||||
*tensor3 True check_gradient_numerical(D=3096)
|
||||
```
|
||||
@@ -0,0 +1,65 @@
|
||||
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. */
|
||||
|
||||
#include "paddle/extension.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
// declare GPU implementation
|
||||
std::vector<paddle::Tensor>
|
||||
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
|
||||
const paddle::Tensor &value_spatial_shapes,
|
||||
const paddle::Tensor &value_level_start_index,
|
||||
const paddle::Tensor &sampling_locations,
|
||||
const paddle::Tensor &attention_weights);
|
||||
|
||||
std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
|
||||
const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
|
||||
const paddle::Tensor &value_level_start_index,
|
||||
const paddle::Tensor &sampling_locations,
|
||||
const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
|
||||
|
||||
//// CPU not implemented
|
||||
|
||||
// Infers the single output shape of ms_deformable_attn from its input shapes.
// Only value_shape and sampling_locations_shape are read; the result is
// {value_shape[0], sampling_locations_shape[1], value_shape[2] * value_shape[3]}.
std::vector<std::vector<int64_t>>
MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
                           std::vector<int64_t> value_spatial_shapes_shape,
                           std::vector<int64_t> value_level_start_index_shape,
                           std::vector<int64_t> sampling_locations_shape,
                           std::vector<int64_t> attention_weights_shape) {
  const int64_t batch = value_shape[0];
  const int64_t query_length = sampling_locations_shape[1];
  // Last two dims of value are merged into one output dim.
  const int64_t merged_channels = value_shape[2] * value_shape[3];

  std::vector<int64_t> out_shape;
  out_shape.push_back(batch);
  out_shape.push_back(query_length);
  out_shape.push_back(merged_channels);

  std::vector<std::vector<int64_t>> out_shapes;
  out_shapes.push_back(out_shape);
  return out_shapes;
}
|
||||
|
||||
std::vector<paddle::DataType>
|
||||
MSDeformableAttnInferDtype(paddle::DataType value_dtype,
|
||||
paddle::DataType value_spatial_shapes_dtype,
|
||||
paddle::DataType value_level_start_index_dtype,
|
||||
paddle::DataType sampling_locations_dtype,
|
||||
paddle::DataType attention_weights_dtype) {
|
||||
return {value_dtype};
|
||||
}
|
||||
|
||||
// Registers the forward op `ms_deformable_attn`: five inputs, one output "Out",
// the CUDA forward kernel, and the shape/dtype inference functions above.
PD_BUILD_OP(ms_deformable_attn)
|
||||
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
|
||||
"AttentionWeights"})
|
||||
.Outputs({"Out"})
|
||||
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
|
||||
.SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
|
||||
.SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
|
||||
|
||||
// Registers the matching gradient op: it receives the forward inputs plus the
// gradient of "Out" and declares a gradient output for each forward input,
// computed by the CUDA backward kernel.
PD_BUILD_GRAD_OP(ms_deformable_attn)
|
||||
.Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
|
||||
"AttentionWeights", paddle::Grad("Out")})
|
||||
.Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
|
||||
paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
|
||||
paddle::Grad("AttentionWeights")})
|
||||
.SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,7 @@
|
||||
from paddle.utils.cpp_extension import CUDAExtension, setup
|
||||
|
||||
if __name__ == "__main__":
    # Build the custom op package from the C++ registration file and the
    # CUDA kernel file that sit next to this script.
    op_sources = ['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']
    cuda_extension = CUDAExtension(sources=op_sources)
    setup(name='deformable_detr_ops', ext_modules=cuda_extension)
|
||||
@@ -0,0 +1,140 @@
|
||||
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import print_function
|
||||
from __future__ import division
|
||||
|
||||
import os
|
||||
import sys
|
||||
import random
|
||||
import numpy as np
|
||||
import paddle
|
||||
# add python path of PaddleDetection to sys.path
|
||||
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
|
||||
if parent_path not in sys.path:
|
||||
sys.path.append(parent_path)
|
||||
|
||||
from ppdet.modeling.transformers.utils import deformable_attention_core_func
|
||||
ms_deform_attn_core_paddle = deformable_attention_core_func
|
||||
|
||||
# The optional first command-line argument selects the GPU to test on.
# Fall back to GPU 0 when it is missing (IndexError) or not an integer
# (ValueError); other exceptions such as KeyboardInterrupt are not swallowed.
try:
    gpu_index = int(sys.argv[1])
except (IndexError, ValueError):
    gpu_index = 0
print(f'Use gpu {gpu_index} to test...')
paddle.set_device(f'gpu:{gpu_index}')
|
||||
|
||||
try:
|
||||
from deformable_detr_ops import ms_deformable_attn
|
||||
except Exception as e:
|
||||
print('import deformable_detr_ops error', e)
|
||||
sys.exit(-1)
|
||||
|
||||
# Seed every random number generator in use so the test tensors are
# reproducible from run to run.
paddle.seed(1)
random.seed(1)
np.random.seed(1)

# Problem sizes shared by all checks below.
bs = 2
n_heads = 8
c = 8
query_length = 2
n_levels = 2
n_points = 2

# (H, W) of each feature level, and the offset at which each level starts
# in the flattened value tensor.
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
_level_sizes = spatial_shapes.prod(1)
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), _level_sizes.cumsum(0)[:-1]))
# Total number of positions over all levels.
value_length = sum((H * W).item() for H, W in spatial_shapes)
|
||||
|
||||
|
||||
def get_test_tensors(channels):
    """Create random inputs for the deformable attention op.

    Returns ``[value, sampling_locations, attention_weights]``. The
    attention weights are strictly positive and normalised so that they sum
    to 1 over the last two axes.
    """
    value_shape = [bs, value_length, n_heads, channels]
    location_shape = [bs, query_length, n_heads, n_levels, n_points, 2]
    weight_shape = [bs, query_length, n_heads, n_levels, n_points]

    # Tensors are drawn in a fixed order so the seeded RNG yields the same
    # data every run.
    value = paddle.rand(value_shape, dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(location_shape, dtype=paddle.float32)
    attention_weights = paddle.rand(weight_shape, dtype=paddle.float32) + 1e-5
    weight_total = attention_weights.sum(-1, keepdim=True).sum(-2,
                                                                keepdim=True)
    attention_weights /= weight_total

    return [value, sampling_locations, attention_weights]
|
||||
|
||||
|
||||
@paddle.no_grad()
def check_forward_equal_with_paddle_float():
    """Compare the custom op's forward output with the Paddle reference.

    Prints whether both outputs agree within rtol=1e-2 / atol=1e-3, together
    with the maximum absolute and relative error.
    """
    value, sampling_locations, attention_weights = get_test_tensors(c)
    op_inputs = (value, spatial_shapes, level_start_index,
                 sampling_locations, attention_weights)

    output_paddle = ms_deform_attn_core_paddle(*op_inputs).detach().cpu()
    output_cuda = ms_deformable_attn(*op_inputs).detach().cpu()

    fwdok = paddle.allclose(
        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
    abs_diff = (output_cuda - output_paddle).abs()
    max_abs_err = abs_diff.max().item()
    # Relative error is measured against the reference output.
    max_rel_err = (abs_diff / output_paddle.abs()).max().item()

    print(
        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
    )
|
||||
|
||||
|
||||
def check_gradient_numerical(channels=4):
    """Compare input gradients of the custom op with the Paddle reference.

    The same random inputs are fed to both implementations, the sum of each
    output is back-propagated, and for every input (value, sampling
    locations, attention weights) a line is printed telling whether the two
    gradients agree within ``rtol=1e-2`` / ``atol=1e-3``.

    Args:
        channels: per-head channel count of the generated ``value`` tensor.
    """

    def _require_grad(tensors):
        # Mark every tensor as trainable so backward() fills in .grad.
        for tensor in tensors:
            tensor.stop_gradient = False
        return tensors

    def _grads_match(reference, candidate):
        return paddle.allclose(
            reference.grad, candidate.grad, rtol=1e-2, atol=1e-3).item()

    paddle_inputs = _require_grad(get_test_tensors(channels))
    # Independent copies, so each implementation accumulates its own grads.
    cuda_inputs = _require_grad(
        [tensor.detach().clone() for tensor in paddle_inputs])

    (value_paddle, sampling_locations_paddle,
     attention_weights_paddle) = paddle_inputs
    value_cuda, sampling_locations_cuda, attention_weights_cuda = cuda_inputs

    output_paddle = ms_deform_attn_core_paddle(
        value_paddle, spatial_shapes, level_start_index,
        sampling_locations_paddle, attention_weights_paddle)
    output_paddle.sum().backward()

    output_cuda = ms_deformable_attn(
        value_cuda, spatial_shapes, level_start_index,
        sampling_locations_cuda, attention_weights_cuda)
    output_cuda.sum().backward()

    res = _grads_match(value_paddle, value_cuda)
    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')

    res = _grads_match(sampling_locations_paddle, sampling_locations_cuda)
    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')

    res = _grads_match(attention_weights_paddle, attention_weights_cuda)
    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Forward parity is checked once with the default channel count.
    check_forward_equal_with_paddle_float()

    # Gradient parity is checked over a range of channel counts, including
    # values that are not powers of two.
    gradient_check_channels = (30, 32, 64, 71, 128, 1024, 1025, 2048, 3096)
    for channels in gradient_check_channels:
        check_gradient_numerical(channels)
|
||||
287
rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
Normal file
287
rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ppdet.modeling.ops import get_act_fn
|
||||
from ..shape_spec import ShapeSpec
|
||||
from ..backbones.csp_darknet import BaseConv
|
||||
from ..backbones.cspresnet import RepVggBlock
|
||||
from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
|
||||
from ..initializer import xavier_uniform_, linear_init_
|
||||
from ..layers import MultiHeadAttention
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
__all__ = ['HybridEncoder']
|
||||
|
||||
|
||||
class CSPRepLayer(nn.Layer):
    """Two-branch block: a 1x1 conv followed by a stack of RepVggBlocks, plus
    a parallel 1x1 conv; the branches are summed and optionally projected.

    Args:
        in_channels: channels of the input feature map.
        out_channels: channels of the output feature map.
        num_blocks: number of RepVggBlocks in the main branch.
        expansion: hidden width is ``int(out_channels * expansion)``.
        bias: forwarded to every ``BaseConv``.
        act: activation forwarded to ``BaseConv`` and ``RepVggBlock``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=3,
                 expansion=1.0,
                 bias=False,
                 act="silu"):
        super(CSPRepLayer, self).__init__()
        hidden_channels = int(out_channels * expansion)
        # Settings shared by every 1x1 convolution in this block.
        pointwise = dict(ksize=1, stride=1, bias=bias, act=act)

        self.conv1 = BaseConv(in_channels, hidden_channels, **pointwise)
        self.conv2 = BaseConv(in_channels, hidden_channels, **pointwise)

        rep_blocks = [
            RepVggBlock(
                hidden_channels, hidden_channels, act=act)
            for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*rep_blocks)

        # An output projection is only needed when the hidden width differs
        # from the requested output width.
        if hidden_channels == out_channels:
            self.conv3 = nn.Identity()
        else:
            self.conv3 = BaseConv(hidden_channels, out_channels, **pointwise)

    def forward(self, x):
        """Return ``conv3(bottlenecks(conv1(x)) + conv2(x))``."""
        main_branch = self.bottlenecks(self.conv1(x))
        side_branch = self.conv2(x)
        return self.conv3(main_branch + side_branch)
|
||||
|
||||
|
||||
@register
class TransformerLayer(nn.Layer):
    """Transformer encoder layer: self-attention followed by a feed-forward
    network, each wrapped in a residual connection and a LayerNorm.

    Args:
        d_model: feature width of the layer.
        nhead: number of attention heads.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout rate applied to both residual branches.
        activation: name of a function in ``paddle.nn.functional``.
        attn_dropout: dropout rate inside attention; ``dropout`` when None.
        act_dropout: dropout rate after the activation; ``dropout`` when None.
        normalize_before: apply LayerNorm before (True) or after (False)
            each sub-layer.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerLayer, self).__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)

        # Feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialise both feed-forward linear layers."""
        for layer in (self.linear1, self.linear2):
            linear_init_(layer)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Add ``pos_embed`` to ``tensor`` unless it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Run the layer.

        Args:
            src: input sequence.
            src_mask: optional mask forwarded to the attention module.
            pos_embed: optional positional embedding added to the query and
                key only (the value is left without it).

        Returns:
            The transformed sequence.
        """
        pre_norm = self.normalize_before

        # Self-attention sub-layer.
        shortcut = src
        if pre_norm:
            src = self.norm1(src)
        query = key = self.with_pos_embed(src, pos_embed)
        attended = self.self_attn(query, key, value=src, attn_mask=src_mask)
        src = shortcut + self.dropout1(attended)
        if not pre_norm:
            src = self.norm1(src)

        # Feed-forward sub-layer.
        shortcut = src
        if pre_norm:
            src = self.norm2(src)
        hidden = self.activation(self.linear1(src))
        projected = self.linear2(self.dropout(hidden))
        src = shortcut + self.dropout2(projected)
        if not pre_norm:
            src = self.norm2(src)
        return src
|
||||
|
||||
|
||||
@register
@serializable
class HybridEncoder(nn.Layer):
    """Multi-scale feature encoder.

    Each input level is projected to ``hidden_dim`` channels, the levels
    listed in ``use_encoder_idx`` are passed through a transformer encoder,
    and the levels are then fused by a top-down pass followed by a bottom-up
    pass.

    Args:
        in_channels: channel count of every input feature level.
        feat_strides: stride of every input feature level.
        hidden_dim: channel count used for all internal and output features.
        use_encoder_idx: indices of the levels sent through the transformer
            encoder.
        num_encoder_layers: layers per transformer encoder; the encoder step
            is skipped when this is 0.
        encoder_layer: injected encoder layer handed to TransformerEncoder.
        pe_temperature: temperature of the sin-cos position embedding.
        expansion: expansion ratio forwarded to every CSPRepLayer.
        depth_mult: each CSPRepLayer gets ``round(3 * depth_mult)`` blocks.
        act: activation (name, dict, None or a ready object).
        trt: forwarded to ``get_act_fn``.
        eval_size: optional evaluation input size; when set, position
            embeddings for that size are built once at construction.
    """
    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
    __inject__ = ['encoder_layer']

    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 hidden_dim=256,
                 use_encoder_idx=[2],
                 num_encoder_layers=1,
                 encoder_layer='TransformerLayer',
                 pe_temperature=10000,
                 expansion=1.0,
                 depth_mult=1.0,
                 act='silu',
                 trt=False,
                 eval_size=None):
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.eval_size = eval_size

        # Channel projection: 1x1 conv + BatchNorm (without weight decay on
        # the norm parameters) for every input level.
        self.input_proj = nn.LayerList()
        for level_channels in in_channels:
            projection = nn.Sequential(
                nn.Conv2D(
                    level_channels, hidden_dim, kernel_size=1,
                    bias_attr=False),
                nn.BatchNorm2D(
                    hidden_dim,
                    weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                    bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
            self.input_proj.append(projection)

        # One transformer encoder per selected level.
        self.encoder = nn.LayerList([
            TransformerEncoder(encoder_layer, num_encoder_layers)
            for _ in range(len(use_encoder_idx))
        ])

        # Resolve the activation only when it is given as a spec.
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        num_fusion_stages = len(in_channels) - 1
        blocks_per_stage = round(3 * depth_mult)

        # Top-down path.
        self.lateral_convs = nn.LayerList()
        self.fpn_blocks = nn.LayerList()
        for _ in range(num_fusion_stages):
            self.lateral_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    blocks_per_stage,
                    act=act,
                    expansion=expansion))

        # Bottom-up path.
        self.downsample_convs = nn.LayerList()
        self.pan_blocks = nn.LayerList()
        for _ in range(num_fusion_stages):
            self.downsample_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 3, stride=2, act=act))
            self.pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    blocks_per_stage,
                    act=act,
                    expansion=expansion))

        self._reset_parameters()

    def _reset_parameters(self):
        """Build position embeddings for ``eval_size`` ahead of time.

        Does nothing when ``eval_size`` is not set. Otherwise one embedding
        per encoded level is stored as attribute ``pos_embed{idx}``.
        """
        if not self.eval_size:
            return
        for idx in self.use_encoder_idx:
            stride = self.feat_strides[idx]
            # eval_size is indexed as [0] -> height, [1] -> width here.
            grid_w = self.eval_size[1] // stride
            grid_h = self.eval_size[0] // stride
            pos_embed = self.build_2d_sincos_position_embedding(
                grid_w, grid_h, self.hidden_dim, self.pe_temperature)
            setattr(self, f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(w,
                                           h,
                                           embed_dim=256,
                                           temperature=10000.):
        """Build a 2D sin-cos position embedding.

        Args:
            w: grid width.
            h: grid height.
            embed_dim: embedding size; must be divisible by 4.
            temperature: base of the frequency schedule.

        Returns:
            Tensor of shape ``[1, w * h, embed_dim]`` made of the
            concatenation ``[sin(w), cos(w), sin(h), cos(h)]``.
        """
        axis_w = paddle.arange(int(w), dtype=paddle.float32)
        axis_h = paddle.arange(int(h), dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(axis_w, axis_h)
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        pos_dim = embed_dim // 4
        exponents = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**exponents)

        # Outer product of flattened coordinates and frequencies.
        out_w = grid_w.flatten()[..., None] @ omega[None]
        out_h = grid_h.flatten()[..., None] @ omega[None]

        parts = [
            paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
            paddle.cos(out_h)
        ]
        return paddle.concat(parts, axis=1)[None, :, :]

    def forward(self, feats, for_mot=False):
        """Encode and fuse the input feature levels.

        Args:
            feats: one feature map per entry of ``in_channels``.
            for_mot: not used by this method.

        Returns:
            List of fused feature maps, one per input level.
        """
        num_levels = len(self.in_channels)
        assert len(feats) == num_levels
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]

        # Transformer encoder on the selected levels.
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                level_feat = proj_feats[enc_ind]
                h, w = level_feat.shape[2:]
                # [B, C, H, W] -> [B, HxW, C]
                src_flatten = level_feat.flatten(2).transpose([0, 2, 1])
                use_cached = (not self.training and
                              self.eval_size is not None)
                if use_cached:
                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
                else:
                    pos_embed = self.build_2d_sincos_position_embedding(
                        w, h, self.hidden_dim, self.pe_temperature)
                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
                # [B, HxW, C] -> [B, C, H, W]
                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
                    [-1, self.hidden_dim, h, w])

        # Top-down path: start at the last level and walk towards the first.
        inner_outs = [proj_feats[-1]]
        for stage, idx in enumerate(range(num_levels - 1, 0, -1)):
            feat_low = proj_feats[idx - 1]
            feat_high = self.lateral_convs[stage](inner_outs[0])
            # The lateral output replaces the entry it was computed from.
            inner_outs[0] = feat_high

            upsample_feat = F.interpolate(
                feat_high, scale_factor=2., mode="nearest")
            fused = self.fpn_blocks[stage](paddle.concat(
                [upsample_feat, feat_low], axis=1))
            inner_outs.insert(0, fused)

        # Bottom-up path: start at the first level and walk towards the last.
        outs = [inner_outs[0]]
        for idx, lateral_feat in enumerate(inner_outs[1:]):
            downsample_feat = self.downsample_convs[idx](outs[-1])
            fused = self.pan_blocks[idx](paddle.concat(
                [downsample_feat, lateral_feat], axis=1))
            outs.append(fused)

        return outs

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``feat_strides`` from ``input_shape``."""
        channels = [spec.channels for spec in input_shape]
        strides = [spec.stride for spec in input_shape]
        return {'in_channels': channels, 'feat_strides': strides}

    @property
    def out_shape(self):
        """One ShapeSpec per level: ``hidden_dim`` channels, input stride."""
        return [
            ShapeSpec(
                channels=self.hidden_dim, stride=self.feat_strides[idx])
            for idx in range(len(self.in_channels))
        ]
|
||||
184
rtdetr_paddle/ppdet/modeling/transformers/matchers.py
Normal file
184
rtdetr_paddle/ppdet/modeling/transformers/matchers.py
Normal file
@@ -0,0 +1,184 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from scipy.optimize import linear_sum_assignment
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
from ..losses.iou_loss import GIoULoss
|
||||
from .utils import bbox_cxcywh_to_xyxy
|
||||
|
||||
__all__ = ['HungarianMatcher']
|
||||
|
||||
|
||||
@register
@serializable
class HungarianMatcher(nn.Layer):
    """Assign predictions to ground-truth objects with the Hungarian method.

    The matching cost is a weighted sum of a classification cost, an L1 box
    cost and a GIoU cost, plus mask and dice costs when ``with_mask`` is set.
    """
    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']

    def __init__(self,
                 matcher_coeff={
                     'class': 1,
                     'bbox': 5,
                     'giou': 2,
                     'mask': 1,
                     'dice': 1
                 },
                 use_focal_loss=False,
                 with_mask=False,
                 num_sample_points=12544,
                 alpha=0.25,
                 gamma=2.0):
        r"""
        Args:
            matcher_coeff (dict): The coefficient of hungarian matcher cost.
            use_focal_loss (bool): Use sigmoid probabilities and a focal-style
                classification cost instead of softmax probabilities.
            with_mask (bool): Add mask and dice costs to the matching cost.
            num_sample_points (int): Number of random points sampled from the
                masks when ``with_mask`` is set.
            alpha (float): Focal cost balancing factor.
            gamma (float): Focal cost exponent.
        """
        super(HungarianMatcher, self).__init__()
        self.matcher_coeff = matcher_coeff
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.num_sample_points = num_sample_points
        self.alpha = alpha
        self.gamma = gamma

        self.giou_loss = GIoULoss()

    def forward(self,
                boxes,
                logits,
                gt_bbox,
                gt_class,
                masks=None,
                gt_mask=None):
        r"""
        Args:
            boxes (Tensor): [b, query, 4]
            logits (Tensor): [b, query, num_classes]
            gt_bbox (List(Tensor)): list[[n, 4]]
            gt_class (List(Tensor)): list[[n, 1]]
            masks (Tensor|None): [b, query, h, w]
            gt_mask (List(Tensor)): list[[n, H, W]]

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)

        Raises:
            AssertionError: if ``with_mask`` is set but ``masks`` or
                ``gt_mask`` is None.
        """
        bs, num_queries = boxes.shape[:2]

        num_gts = [len(a) for a in gt_class]
        # Nothing to match in the whole batch: return empty index pairs.
        if sum(num_gts) == 0:
            return [(paddle.to_tensor(
                [], dtype=paddle.int64), paddle.to_tensor(
                    [], dtype=paddle.int64)) for _ in range(bs)]

        # We flatten to compute the cost matrices in a batch
        # [batch_size * num_queries, num_classes]
        logits = logits.detach()
        out_prob = F.sigmoid(logits.flatten(
            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
        # [batch_size * num_queries, 4]
        out_bbox = boxes.detach().flatten(0, 1)

        # Also concat the target labels and boxes
        tgt_ids = paddle.concat(gt_class).flatten()
        tgt_bbox = paddle.concat(gt_bbox)

        # Compute the classification cost: keep only the probability of each
        # target's class, one column per target.
        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
        if self.use_focal_loss:
            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
                1 - out_prob + 1e-8).log())
            pos_cost_class = self.alpha * (
                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
            cost_class = pos_cost_class - neg_cost_class
        else:
            cost_class = -out_prob

        # Compute the L1 cost between boxes
        cost_bbox = (
            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)

        # Compute the giou cost between boxes
        cost_giou = self.giou_loss(
            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)

        # Final cost matrix
        C = self.matcher_coeff['class'] * cost_class + \
            self.matcher_coeff['bbox'] * cost_bbox + \
            self.matcher_coeff['giou'] * cost_giou
        # Compute the mask cost and dice cost
        if self.with_mask:
            # The condition and message must not be wrapped in one pair of
            # parentheses: that would assert a non-empty tuple, which is
            # always true, and the check would never fire.
            assert masks is not None and gt_mask is not None, \
                'Make sure the input has `mask` and `gt_mask`'
            # all masks share the same set of points for efficient matching
            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
            # Map the uniform [0, 1) samples to [-1, 1) for grid_sample.
            sample_points = 2.0 * sample_points - 1.0

            out_mask = F.grid_sample(
                masks.detach(), sample_points, align_corners=False).squeeze(-2)
            out_mask = out_mask.flatten(0, 1)

            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
            # Repeat each image's points once per ground-truth object of that
            # image; images without objects contribute nothing.
            sample_points = paddle.concat([
                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
                if b > 0
            ])
            tgt_mask = F.grid_sample(
                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])

            with paddle.amp.auto_cast(enable=False):
                # binary cross entropy cost
                pos_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.ones_like(out_mask), reduction='none')
                neg_cost_mask = F.binary_cross_entropy_with_logits(
                    out_mask, paddle.zeros_like(out_mask), reduction='none')
                cost_mask = paddle.matmul(
                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
                cost_mask /= self.num_sample_points

                # dice cost
                out_mask = F.sigmoid(out_mask)
                numerator = 2 * paddle.matmul(
                    out_mask, tgt_mask, transpose_y=True)
                denominator = out_mask.sum(
                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
                cost_dice = 1 - (numerator + 1) / (denominator + 1)

                C = C + self.matcher_coeff['mask'] * cost_mask + \
                    self.matcher_coeff['dice'] * cost_dice

        # Split the cost matrix per image, then take for image i only the
        # columns belonging to that image's targets before solving.
        C = C.reshape([bs, num_queries, -1])
        C = [a.squeeze(0) for a in C.chunk(bs)]
        sizes = [a.shape[0] for a in gt_bbox]
        indices = [
            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
            for i, c in enumerate(C)
        ]
        return [(paddle.to_tensor(
            i, dtype=paddle.int64), paddle.to_tensor(
                j, dtype=paddle.int64)) for i, j in indices]
|
||||
100
rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
Normal file
100
rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
Normal file
@@ -0,0 +1,100 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
|
||||
from ppdet.core.workspace import register, serializable
|
||||
|
||||
|
||||
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2D position embedding, either sinusoidal or learned.

    Args:
        num_pos_feats: embedding size per spatial axis.
        temperature: base of the sine frequency schedule.
        normalize: rescale the cumulative coordinates to ``[0, scale]``
            before encoding (sine only).
        scale: factor applied after normalisation (sine only).
        embed_type: ``'sine'`` or ``'learned'``.
        num_embeddings: table size of the learned row/column embeddings.
        offset: value added to the coordinates before normalisation.
        eps: value added to the normalisation denominator.
    """

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=2 * math.pi,
                 embed_type='sine',
                 num_embeddings=50,
                 offset=0.,
                 eps=1e-6):
        super(PositionEmbedding, self).__init__()
        assert embed_type in ('sine', 'learned')

        self.embed_type = embed_type
        self.offset = offset
        self.eps = eps
        if self.embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            self.scale = scale
        elif self.embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"{self.embed_type} is not supported.")

    def _sine_embedding(self, mask):
        """Sinusoidal encoding of the cumulative row/column sums of ``mask``."""
        rows = mask.cumsum(1)
        cols = mask.cumsum(2)
        if self.normalize:
            # Divide by the last cumulative value along each axis.
            rows = (rows + self.offset) / (
                rows[:, -1:, :] + self.eps) * self.scale
            cols = (cols + self.offset) / (
                cols[:, :, -1:] + self.eps) * self.scale

        # Consecutive channel pairs share one frequency.
        pair_index = 2 * (paddle.arange(self.num_pos_feats) //
                          2).astype('float32')
        dim_t = self.temperature**(pair_index / self.num_pos_feats)

        def _encode(coords):
            # sin on even channels, cos on odd channels, interleaved.
            scaled = coords.unsqueeze(-1) / dim_t
            return paddle.stack(
                (scaled[:, :, :, 0::2].sin(), scaled[:, :, :, 1::2].cos()),
                axis=4).flatten(3)

        pos_x = _encode(cols)
        pos_y = _encode(rows)
        return paddle.concat((pos_y, pos_x), axis=3)

    def _learned_embedding(self, mask):
        """Learned column/row embeddings tiled over the ``mask`` grid."""
        h, w = mask.shape[-2:]
        x_emb = self.col_embed(paddle.arange(w))
        y_emb = self.row_embed(paddle.arange(h))
        tiled = [
            x_emb.unsqueeze(0).tile([h, 1, 1]),
            y_emb.unsqueeze(1).tile([1, w, 1]),
        ]
        return paddle.concat(tiled, axis=-1).unsqueeze(0)

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W]
        Returns:
            pos (Tensor): [B, H, W, C]; the learned variant has a leading
                dimension of 1 instead of B.
        """
        if self.embed_type == 'sine':
            return self._sine_embedding(mask)
        if self.embed_type == 'learned':
            return self._learned_embedding(mask)
        raise ValueError(f"not supported {self.embed_type}")
|
||||
523
rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
Normal file
523
rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
Normal file
@@ -0,0 +1,523 @@
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
from paddle import ParamAttr
|
||||
from paddle.regularizer import L2Decay
|
||||
|
||||
from ppdet.core.workspace import register
|
||||
from ..layers import MultiHeadAttention
|
||||
from .deformable_transformer import MSDeformableAttention
|
||||
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
|
||||
bias_init_with_prob)
|
||||
from .utils import (_get_clones, get_sine_pos_embed,
|
||||
get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
|
||||
|
||||
__all__ = ['RTDETRTransformer']
|
||||
|
||||
|
||||
class PPMSDeformableAttention(MSDeformableAttention):
    """MSDeformableAttention with a forward pass that accepts reference
    points of size 2 or 4 and picks the attention kernel by query type."""

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: if the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        heads, levels, points = (self.num_heads, self.num_levels,
                                 self.num_points)

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the projected values at padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, heads, levels, points, 2])
        # Softmax runs jointly over all levels and points of a head.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, heads, levels * points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, heads, levels, points])

        ref_dim = reference_points.shape[-1]
        if ref_dim == 2:
            # Offsets are divided by (W, H) of each level; flip turns the
            # stored (H, W) order around.
            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
            offset_normalizer = offset_normalizer.flip([1]).reshape(
                [1, 1, 1, levels, 1, 2])
            anchors = reference_points.reshape([bs, Len_q, 1, levels, 1, 2])
            sampling_locations = anchors + sampling_offsets / offset_normalizer
        elif ref_dim == 4:
            # First two entries are the anchor, last two scale the offsets.
            anchors = reference_points[:, :, None, :, None, :2]
            extents = reference_points[:, :, None, :, None, 2:]
            sampling_locations = (
                anchors + sampling_offsets / points * extents * 0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(ref_dim))

        if isinstance(query, paddle.Tensor):
            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
            value_level_start_index = paddle.to_tensor(value_level_start_index)
            output = self.ms_deformable_attn_core(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)
        else:
            # Non-Tensor query: use the pure-Paddle core function.
            from ppdet.modeling.transformers.utils import deformable_attention_core_func
            output = deformable_attention_core_func(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, attention_weights)

        return self.output_proj(output)
|
||||
|
||||
|
||||
class TransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention, deformable cross-attention and a
    feed-forward network, each followed by a residual add and a LayerNorm.

    Args:
        d_model: feature width.
        n_head: number of attention heads.
        dim_feedforward: hidden width of the feed-forward network.
        dropout: dropout rate used throughout the layer.
        activation: name of a function in ``paddle.nn.functional``.
        n_levels: number of feature levels for cross-attention.
        n_points: sampling points per level for cross-attention.
        weight_attr: weight attribute of the feed-forward linears.
        bias_attr: bias attribute of the feed-forward linears.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 weight_attr=None,
                 bias_attr=None):
        super(TransformerDecoderLayer, self).__init__()

        def _layer_norm():
            # LayerNorm whose scale and bias are excluded from weight decay.
            return nn.LayerNorm(
                d_model,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        # Self-attention.
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = _layer_norm()

        # Cross-attention.
        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
                                                  n_points, 1.0)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = _layer_norm()

        # Feed-forward network.
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
                                 bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
                                 bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = _layer_norm()
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise both linears, then re-draw their weights with Xavier
        uniform."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        """Add ``pos`` to ``tensor`` unless it is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Apply linear1 -> activation -> dropout3 -> linear2."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run the decoder layer.

        Args:
            tgt: query features.
            reference_points: reference points for cross-attention.
            memory: encoder features attended to by cross-attention.
            memory_spatial_shapes: spatial shape of every memory level.
            memory_level_start_index: start offset of every memory level.
            attn_mask: optional self-attention mask; entries that are true
                after a bool cast become 0, the others become -inf.
            memory_mask: optional mask forwarded to cross-attention.
            query_pos_embed: optional positional embedding of the queries.

        Returns:
            The updated query features.
        """
        # Self-attention: position is added to query and key only.
        query = key = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            allowed = paddle.zeros(attn_mask.shape, tgt.dtype)
            blocked = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(
                attn_mask.astype('bool'), allowed, blocked)
        self_out = self.self_attn(query, key, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # Cross-attention.
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(
            cross_query, reference_points, memory, memory_spatial_shapes,
            memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        # Feed-forward network.
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
|
||||
|
||||
|
||||
class TransformerDecoder(nn.Layer):
    """Stack of decoder layers that refines reference boxes layer by layer."""

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Args:
            hidden_dim: stored on the instance as ``hidden_dim``.
            decoder_layer: layer passed to ``_get_clones`` together with
                ``num_layers``.
            num_layers: number of decoder layers.
            eval_idx: index of the layer whose output is returned outside of
                training; a negative value is counted from the end.
        """
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        # Turn a negative index into the equivalent non-negative one.
        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                score_head,
                query_pos_head,
                attn_mask=None,
                memory_mask=None):
        """Run the decoder.

        Args:
            tgt: initial query features.
            ref_points_unact: reference points before the sigmoid.
            memory, memory_spatial_shapes, memory_level_start_index:
                forwarded unchanged to every layer.
            bbox_head: indexable by layer; maps features to box deltas.
            score_head: indexable by layer; maps features to class logits.
            query_pos_head: maps reference points to a query position
                embedding.
            attn_mask, memory_mask: forwarded unchanged to every layer.

        Returns:
            Tuple ``(boxes, logits)``, each stacked along a new leading axis:
            one entry per layer in training, otherwise the single entry of
            layer ``eval_idx``.
        """
        output = tgt
        dec_out_bboxes = []
        dec_out_logits = []
        ref_points_detach = F.sigmoid(ref_points_unact)
        for i, layer in enumerate(self.layers):
            ref_points_input = ref_points_detach.unsqueeze(2)
            query_pos_embed = query_pos_head(ref_points_detach)

            output = layer(output, ref_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Refined box: predicted delta added in logit space to the
            # reference this layer received.
            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
                ref_points_detach))

            if self.training:
                dec_out_logits.append(score_head[i](output))
                if i == 0:
                    dec_out_bboxes.append(inter_ref_bbox)
                else:
                    # From the second layer on, the box added to the outputs
                    # is built on `ref_points`, the previous layer's
                    # refinement that has NOT been detached.
                    dec_out_bboxes.append(
                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
                            ref_points)))
            elif i == self.eval_idx:
                # Outside training only this layer's prediction is kept and
                # the remaining layers are not run.
                dec_out_logits.append(score_head[i](output))
                dec_out_bboxes.append(inter_ref_bbox)
                break

            # `ref_points` keeps the refinement as computed; the reference
            # handed to the next layer is detached in training.
            ref_points = inter_ref_bbox
            ref_points_detach = inter_ref_bbox.detach(
            ) if self.training else inter_ref_bbox

        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)
|
||||
|
||||
|
||||
@register
class RTDETRTransformer(nn.Layer):
    """RT-DETR query selection and decoder head.

    Projects backbone features to `hidden_dim`, flattens them into a single
    sequence, selects the top `num_queries` positions by encoder class score
    as initial reference boxes, and runs a `TransformerDecoder` with
    per-layer box and score heads. In training mode, denoising queries from
    `get_contrastive_denoising_training_group` are prepended.

    Args:
        num_classes (int): Output size of every score head.
        hidden_dim (int): Feature width used throughout.
        num_queries (int): Number of positions selected by top-k.
        position_embed_type (str): Must be 'sine' or 'learned'; only
            validated here.
        backbone_feat_channels (list[int]): Input channels per feature level.
        feat_strides (list[int]): Stride per feature level. Extended (on a
            copy) by doubling the last stride until it has `num_levels`
            entries.
        num_levels (int): Number of feature levels used by the decoder.
        num_decoder_points (int): Forwarded to `TransformerDecoderLayer`.
        nhead (int): Forwarded to `TransformerDecoderLayer`.
        num_decoder_layers (int): Number of decoder layers and heads.
        dim_feedforward (int): Forwarded to `TransformerDecoderLayer`.
        dropout (float): Forwarded to `TransformerDecoderLayer`.
        activation (str): Forwarded to `TransformerDecoderLayer`.
        num_denoising (int): Forwarded to the denoising group builder.
        label_noise_ratio (float): Forwarded to the denoising group builder.
        box_noise_scale (float): Forwarded to the denoising group builder.
        learnt_init_query (bool): If True, initial query features come from a
            learned embedding instead of the selected encoder features.
        eval_size: If truthy, `(eval_size[0], eval_size[1])` is divided by
            each stride to pre-compute anchors once at construction time.
        eval_idx (int): Forwarded to `TransformerDecoder`.
        eps (float): Anchors with any coordinate outside `(eps, 1 - eps)` are
            marked invalid.
    """

    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=300,
                 position_embed_type='sine',
                 backbone_feat_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 num_levels=3,
                 num_decoder_points=4,
                 nhead=8,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eval_size=None,
                 eval_idx=-1,
                 eps=1e-2):
        super(RTDETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(backbone_feat_channels) <= num_levels
        assert len(feat_strides) == len(backbone_feat_channels)
        # Work on a copy: appending to the argument itself would mutate the
        # shared default list (or the caller's list), so a later instance
        # would see the extended list and fail the length assert above.
        feat_strides = list(feat_strides)
        for _ in range(num_levels - len(feat_strides)):
            feat_strides.append(feat_strides[-1] * 2)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.feat_strides = feat_strides
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers
        self.eval_size = eval_size

        # backbone feature projection
        self._build_input_proj_layer(backbone_feat_channels)

        # Transformer module
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points)
        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
                                          num_decoder_layers, eval_idx)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head: one score head and one box head per decoder layer
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise heads and projections; cache eval anchors if possible."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for l in self.input_proj:
            xavier_uniform_(l[0].weight)

        # init encoder output anchors and valid_mask
        if self.eval_size:
            self.anchors, self.valid_mask = self._generate_anchors()

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive `backbone_feat_channels` from the `channels` of each input."""
        return {'backbone_feat_channels': [i.channels for i in input_shape]}

    def _build_input_proj_layer(self, backbone_feat_channels):
        """Create `self.input_proj`: one conv + batch-norm block per level.

        Levels that have a backbone feature get a 1x1 conv. Each additional
        level (up to `self.num_levels`) gets a 3x3 stride-2 conv; the first
        extra block takes the last backbone channel count as input, later
        ones take `self.hidden_dim`.
        """
        self.input_proj = nn.LayerList()
        for in_channels in backbone_feat_channels:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
        in_channels = backbone_feat_channels[-1]
        for _ in range(self.num_levels - len(backbone_feat_channels)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats):
        """Project and flatten multi-level features into one sequence.

        Returns:
            tuple: `(feat_flatten, spatial_shapes, level_start_index)` where
            `feat_flatten` is the `[b, sum(h*w), c]` concatenation of all
            levels, `spatial_shapes` is a list of `[h, w]` per level and
            `level_start_index` is the offset of each level in the sequence.
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                if i == len_srcs:
                    # First extra level is computed from the raw last feature.
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        spatial_shapes = []
        level_start_index = [0, ]
        for i, feat in enumerate(proj_feats):
            _, _, h, w = feat.shape
            # [b, c, h, w] -> [b, h*w, c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            # [num_levels, 2]
            spatial_shapes.append([h, w])
            # [l], start index of each level
            level_start_index.append(h * w + level_start_index[-1])

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # Drop the trailing total length; only per-level starts are kept.
        level_start_index.pop()
        return (feat_flatten, spatial_shapes, level_start_index)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Run query selection and decoding.

        Args:
            feats (list[Tensor]): Backbone / encoder feature maps per level.
            pad_mask: Accepted but not used by this method.
            gt_meta: Ground-truth dict passed to the denoising group builder
                in training mode.

        Returns:
            tuple: `(out_bboxes, out_logits, enc_topk_bboxes,
            enc_topk_logits, dn_meta)`; `dn_meta` is None in eval mode.
        """
        # input projection and embedding
        (memory, spatial_shapes,
         level_start_index) = self._get_encoder_input(feats)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, denoising_class, denoising_bbox_unact)

        # decoder
        out_bboxes, out_logits = self.decoder(
            target,
            init_ref_points_unact,
            memory,
            spatial_shapes,
            level_start_index,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask)
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _generate_anchors(self,
                          spatial_shapes=None,
                          grid_size=0.05,
                          dtype="float32"):
        """Build one anchor per feature position, in inverse-sigmoid space.

        Args:
            spatial_shapes: List of `[h, w]` per level. If None, derived from
                `self.eval_size` and `self.feat_strides`.
            grid_size (float): Base anchor width/height; doubled per level.
            dtype (str): Dtype of the generated grids.

        Returns:
            tuple: `(anchors, valid_mask)`. `anchors` is `[1, sum(h*w), 4]`
            holding `log(a / (1 - a))` of the normalised (cx, cy, w, h)
            anchors, with invalid rows set to `inf`. `valid_mask` is the
            boolean `[1, sum(h*w), 1]` mask of rows whose four values all lie
            strictly inside `(eps, 1 - eps)`.
        """
        if spatial_shapes is None:
            spatial_shapes = [
                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
                for s in self.feat_strides
            ]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(
                    end=h, dtype=dtype),
                paddle.arange(
                    end=w, dtype=dtype))
            grid_xy = paddle.stack([grid_x, grid_y], -1)

            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
            # Cell centres normalised to (0, 1).
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

        anchors = paddle.concat(anchors, 1)
        valid_mask = ((anchors > self.eps) *
                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
        anchors = paddle.log(anchors / (1 - anchors))
        anchors = paddle.where(valid_mask, anchors,
                               paddle.to_tensor(float("inf")))
        return anchors, valid_mask

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top-k encoder positions as decoder queries.

        Args:
            memory (Tensor): Flattened encoder features (3-D; the first
                dimension is the batch size).
            spatial_shapes: List of `[h, w]` per level, used to build anchors
                when they are not cached.
            denoising_class (Tensor|None): Denoising query features to
                prepend to the query features.
            denoising_bbox_unact (Tensor|None): Denoising boxes (pre-sigmoid)
                to prepend to the reference boxes.

        Returns:
            tuple: `(target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits)`.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        if self.training or self.eval_size is None:
            anchors, valid_mask = self._generate_anchors(spatial_shapes)
        else:
            anchors, valid_mask = self.anchors, self.valid_mask
        # Zero out features at positions whose anchor is invalid.
        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)

        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

        # Rank positions by their best class score.
        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        if self.training:
            reference_points_unact = reference_points_unact.detach()
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind)
            if self.training:
                target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
|
||||
481
rtdetr_paddle/ppdet/modeling/transformers/utils.py
Normal file
481
rtdetr_paddle/ppdet/modeling/transformers/utils.py
Normal file
@@ -0,0 +1,481 @@
|
||||
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# Modified from DETR (https://github.com/facebookresearch/detr)
|
||||
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
|
||||
# Modified from detrex (https://github.com/IDEA-Research/detrex)
|
||||
# Copyright 2022 The IDEA Authors. All rights reserved.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import copy
|
||||
import math
|
||||
import paddle
|
||||
import paddle.nn as nn
|
||||
import paddle.nn.functional as F
|
||||
|
||||
|
||||
# Names exported by `from .utils import *`.
__all__ = [
    '_get_clones',
    'bbox_cxcywh_to_xyxy',
    'bbox_xyxy_to_cxcywh',
    'sigmoid_focal_loss',
    'inverse_sigmoid',
    'deformable_attention_core_func',
    'varifocal_loss_with_logits',
]
|
||||
|
||||
|
||||
|
||||
def bbox_area(boxes):
    """Return the area of boxes given in (x1, y1, x2, y2) order.

    Args:
        boxes (Tensor): Boxes with shape [..., 4]; the last axis holds
            (x1, y1, x2, y2).

    Returns:
        Tensor: Areas with shape `boxes.shape[:-1]`. The result is the raw
        product `(x2 - x1) * (y2 - y1)`; it is not clipped at zero.
    """
    # Ellipsis indexing keeps the [M, 4] behaviour and also accepts
    # additional leading (batch) dimensions.
    widths = boxes[..., 2] - boxes[..., 0]
    heights = boxes[..., 3] - boxes[..., 1]
    return widths * heights
|
||||
|
||||
|
||||
def bbox_overlaps(boxes1, boxes2):
    """Compute the pairwise IoU between two sets of (x1, y1, x2, y2) boxes.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): IoU matrix with shape [M, N]. Pairs without a
            positive intersection get 0. If either set is empty, a float32
            zero tensor of shape [M, N] is returned.
    """
    num_a, num_b = boxes1.shape[0], boxes2.shape[0]
    if num_a * num_b == 0:
        return paddle.zeros([num_a, num_b], dtype='float32')

    areas_a = bbox_area(boxes1)
    areas_b = bbox_area(boxes2)

    # [M, 1, 4] so that the element-wise ops broadcast against [N, 4].
    expanded_a = paddle.unsqueeze(boxes1, 1)
    bottom_right = paddle.minimum(expanded_a[:, :, 2:], boxes2[:, 2:])
    top_left = paddle.maximum(expanded_a[:, :, :2], boxes2[:, :2])

    # Negative extents mean "no overlap"; clamp them to zero.
    inter = (bottom_right - top_left).clip(min=0).prod(axis=2)

    union = paddle.unsqueeze(areas_a, 1) + areas_b - inter
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
|
||||
|
||||
|
||||
def _get_clones(module, N):
    """Return an `nn.LayerList` holding `N` independent deep copies of `module`."""
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.LayerList(copies)
|
||||
|
||||
|
||||
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2).

    The last axis of `x` is split into two equal halves: centre and size.
    """
    center, size = paddle.split(x, 2, axis=-1)
    half = 0.5 * size
    return paddle.concat([center - half, center + half], axis=-1)
|
||||
|
||||
|
||||
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x1, y1, x2, y2) to (cx, cy, w, h).

    The last axis of `x` is split into four equal parts.
    """
    left, top, right, bottom = x.split(4, axis=-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = (right - left)
    height = (bottom - top)
    return paddle.concat([center_x, center_y, width, height], axis=-1)
|
||||
|
||||
|
||||
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Sigmoid focal loss computed from raw logits.

    Args:
        logit (Tensor): Raw (pre-sigmoid) predictions.
        label (Tensor): Targets with the same shape as `logit`.
        normalizer (float): Divisor applied to the reduced loss.
        alpha (float): Weight of the positive term; the negative term gets
            `1 - alpha`. A negative value disables this weighting.
        gamma (float): Exponent of the modulating factor `(1 - p_t)`.

    Returns:
        Tensor: Loss averaged over axis 1, summed over the remaining axes
        and divided by `normalizer`.
    """
    prob = F.sigmoid(logit)
    bce = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    # Probability assigned to the target value of each element.
    p_t = prob * label + (1 - prob) * (1 - label)
    focal = bce * ((1 - p_t)**gamma)

    if alpha >= 0:
        alpha_t = alpha * label + (1 - alpha) * (1 - label)
        focal = alpha_t * focal
    return focal.mean(1).sum() / normalizer
|
||||
|
||||
|
||||
def inverse_sigmoid(x, eps=1e-5):
    """Return `log(x / (1 - x))` with clamping for numerical safety.

    `x` is first clipped to [0, 1]; numerator and denominator are then each
    clipped to be at least `eps`, so the result is always finite.
    """
    clipped = x.clip(min=0., max=1.)
    numerator = clipped.clip(min=eps)
    denominator = (1 - clipped).clip(min=eps)
    return paddle.log(numerator / denominator)
|
||||
|
||||
|
||||
def deformable_attention_core_func(value, value_spatial_shapes,
                                   value_level_start_index, sampling_locations,
                                   attention_weights):
    """Sample `value` at the given locations and combine with the weights.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        value_level_start_index (Tensor|List): [n_levels]; not used by this
            implementation (levels are split by their `h * w` sizes).
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C] with C = n_head * c
    """
    bs, _, n_head, c = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape

    # Split the flattened sequence back into one chunk per level.
    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, axis=1)
    # Rescale locations from [0, 1] to the [-1, 1] range used by grid_sample.
    grids = 2 * sampling_locations - 1

    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        level_value = per_level_values[level].flatten(2)
        level_value = level_value.transpose([0, 2, 1])
        level_value = level_value.reshape([bs * n_head, c, h, w])
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        level_grid = grids[:, :, :, level].transpose([0, 2, 1, 3, 4])
        level_grid = level_grid.flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampled.append(
            F.grid_sample(
                level_value,
                level_grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))

    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
        [bs * n_head, 1, Len_q, n_levels * n_points])
    # Stack levels next to the points axis, then merge them: L_*P_ samples.
    stacked = paddle.stack(sampled, axis=-2).flatten(-2)
    output = (stacked * weights).sum(-1).reshape([bs, n_head * c, Len_q])

    return output.transpose([0, 2, 1])
|
||||
|
||||
|
||||
def get_valid_ratio(mask):
    """Return the per-sample valid (width, height) fractions of a mask.

    Args:
        mask (Tensor): 3-D tensor [b, H, W]. The first column and first row
            are summed, so non-zero entries are presumably the valid
            positions -- confirm against the caller.

    Returns:
        Tensor: [b, 2] holding (valid_ratio_w, valid_ratio_h).
    """
    _, H, W = paddle.shape(mask)
    first_column = mask[:, :, 0]
    first_row = mask[:, 0, :]
    valid_ratio_h = paddle.sum(first_column, 1) / H
    valid_ratio_w = paddle.sum(first_row, 1) / W
    # [b, 2]
    return paddle.stack([valid_ratio_w, valid_ratio_h], -1)
|
||||
|
||||
|
||||
def get_denoising_training_group(targets,
                                 num_classes,
                                 num_queries,
                                 class_embed,
                                 num_denoising=100,
                                 label_noise_ratio=0.5,
                                 box_noise_scale=1.0):
    """Build noised ground-truth queries for denoising training.

    Args:
        targets (dict): Must provide "gt_class" and "gt_bbox", each indexable
            per image. Boxes are presumably normalised (cx, cy, w, h) --
            confirm against the data pipeline.
        num_classes (int): Number of classes; also used as the padding label.
        num_queries (int): Number of regular (matching) queries.
        class_embed (Tensor): Class embedding table with `num_classes` rows.
        num_denoising (int): Budget of denoising queries; the number of
            groups is `num_denoising // max_gt_num` (at least 1).
        label_noise_ratio (float): Each valid label is replaced by a random
            class with probability `label_noise_ratio * 0.5`.
        box_noise_scale (float): Scale of the uniform box jitter.

    Returns:
        tuple: `(input_query_class, input_query_bbox, attn_mask, dn_meta)`,
        or four `None` values when `num_denoising <= 0` or the batch has no
        ground-truth boxes. `input_query_bbox` is in inverse-sigmoid space.
        `attn_mask` is boolean with True where attention is allowed.
    """
    if num_denoising <= 0:
        return None, None, None, None
    num_gts = [len(t) for t in targets["gt_class"]]
    max_gt_num = max(num_gts)
    if max_gt_num == 0:
        return None, None, None, None

    num_group = num_denoising // max_gt_num
    num_group = 1 if num_group == 0 else num_group
    # pad gt to max_num of a batch
    bs = len(targets["gt_class"])
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i in range(bs):
        num_gt = num_gts[i]
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1

    # Repeat the padded ground truth once per denoising group.
    input_query_class = input_query_class.tile([1, num_group])
    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, num_group])

    # Column index of every real (non-padding) query, split per image.
    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        pad_gt_mask = pad_gt_mask.flatten()
        # Flip each label with probability label_noise_ratio * 0.5.
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        # Combine as two boolean masks instead of multiplying a bool tensor
        # by the float padding mask (same cast the contrastive variant of
        # this function applies before selecting indices).
        chosen_idx = paddle.nonzero(
            paddle.logical_and(mask, pad_gt_mask.astype('bool'))).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])
        pad_gt_mask.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        # Jitter range: half the size for the centre, full size for w/h.
        diff = paddle.concat(
            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],
            axis=-1) * box_noise_scale
        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
        input_query_bbox += diff
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # Extra all-zero row: embedding of the padding label `num_classes`.
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # All-False boolean matrix; True marks blocked pairs until the final
    # inversion below.
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other
    for i in range(num_group):
        start = max_gt_num * i
        end = max_gt_num * (i + 1)
        if i == 0:
            attn_mask[start:end, end:num_denoising] = True
        if i == num_group - 1:
            attn_mask[start:end, :start] = True
        else:
            attn_mask[start:end, end:num_denoising] = True
            attn_mask[start:end, :start] = True
    attn_mask = ~attn_mask
    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
|
||||
|
||||
|
||||
def get_contrastive_denoising_training_group(targets,
                                             num_classes,
                                             num_queries,
                                             class_embed,
                                             num_denoising=100,
                                             label_noise_ratio=0.5,
                                             box_noise_scale=1.0):
    """Build positive and negative noised queries for denoising training.

    Every group contains `max_gt_num` positive queries followed by
    `max_gt_num` negative queries; negatives receive a larger box
    perturbation (random factor in [1, 2) instead of [0, 1)).

    Args:
        targets (dict): Must provide "gt_class" and "gt_bbox", each indexable
            per image. Boxes are converted with `bbox_cxcywh_to_xyxy`, so
            they are expected in (cx, cy, w, h) form.
        num_classes (int): Number of classes; also used as the padding label.
        num_queries (int): Number of regular (matching) queries.
        class_embed (Tensor): Class embedding table with `num_classes` rows.
        num_denoising (int): Budget used to derive the number of groups,
            `num_denoising // max_gt_num` (at least 1).
        label_noise_ratio (float): Each valid label is replaced by a random
            class with probability `label_noise_ratio * 0.5`.
        box_noise_scale (float): Scale of the box perturbation.

    Returns:
        tuple: `(input_query_class, input_query_bbox, attn_mask, dn_meta)`,
        or four `None` values when `num_denoising <= 0` or the batch has no
        ground-truth boxes. `input_query_bbox` is in inverse-sigmoid space.
        `attn_mask` is boolean with True where attention is allowed.
    """
    if num_denoising <= 0:
        return None, None, None, None
    num_gts = [len(t) for t in targets["gt_class"]]
    max_gt_num = max(num_gts)
    if max_gt_num == 0:
        return None, None, None, None

    num_group = num_denoising // max_gt_num
    if num_group == 0:
        num_group = 1
    # pad gt to max_num of a batch
    bs = len(targets["gt_class"])
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i in range(bs):
        num_gt = num_gts[i]
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1
    # each group has positive and negative queries.
    repeats = 2 * num_group
    input_query_class = input_query_class.tile([1, repeats])
    input_query_bbox = input_query_bbox.tile([1, repeats, 1])
    pad_gt_mask = pad_gt_mask.tile([1, repeats])
    # positive and negative mask: second half of every group is negative
    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])
    negative_gt_mask[:, max_gt_num:] = 1
    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
    positive_gt_mask = 1 - negative_gt_mask
    # contrastive denoising training positive index
    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * 2 * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        pad_gt_mask = pad_gt_mask.flatten()

        # Cast the padding mask to bool before combining it with `mask`.
        pad_gt_mask = pad_gt_mask.astype('bool')

        # Flip each label with probability label_noise_ratio * 0.5.
        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)

        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)

        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])
        pad_gt_mask.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)

        # Perturbation range per corner coordinate: half the box size.
        half_wh = input_query_bbox[..., 2:] * 0.5
        diff = paddle.tile(half_wh, [1, 1, 2]) * box_noise_scale

        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
        rand_part = paddle.rand(input_query_bbox.shape)
        # Negatives are shifted by a factor in [1, 2), positives in [0, 1).
        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
            1 - negative_gt_mask)
        rand_part *= rand_sign
        known_bbox += rand_part * diff
        known_bbox.clip_(min=0.0, max=1.0)
        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # Extra all-zero row: embedding of the padding label `num_classes`.
    zero_row = paddle.zeros([1, class_embed.shape[-1]])
    class_embed = paddle.concat([class_embed, zero_row])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # All-False boolean matrix; True marks blocked pairs until the final
    # inversion below.
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other
    group_len = max_gt_num * 2
    for i in range(num_group):
        start = group_len * i
        end = group_len * (i + 1)
        if i == 0:
            attn_mask[start:end, end:num_denoising] = True
        if i == num_group - 1:
            attn_mask[start:end, :start] = True
        else:
            attn_mask[start:end, end:num_denoising] = True
            attn_mask[start:end, :start] = True
    attn_mask = ~attn_mask
    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
|
||||
|
||||
|
||||
def get_sine_pos_embed(pos_tensor,
                       num_pos_feats=128,
                       temperature=10000,
                       exchange_xy=True):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (Tensor): 3-D tensor whose last axis has `n` coordinates
            (the code indexes three axes and concatenates on axis 2).
        num_pos_feats (int): Embedding size produced per coordinate.
            Default: 128
        temperature (int): The temperature used for scaling
            the position embedding. Default: 10000.
        exchange_xy (bool, optional): Swap the embeddings of the first two
            coordinates. For example, if the input is `[x, y]`, the result
            is `[pos(y), pos(x)]`. Defaults: True.

    Returns:
        Tensor: Position embedding whose last axis has size
            `n * num_pos_feats`.
    """
    scale = 2. * math.pi
    # Exponent 2 * (i // 2): each sin/cos pair shares one frequency.
    dim_t = 2. * paddle.floor_divide(
        paddle.arange(num_pos_feats), paddle.to_tensor(2))
    dim_t = scale / temperature**(dim_t / num_pos_feats)

    def sine_func(x):
        x *= dim_t
        even_sin = x[:, :, 0::2].sin()
        odd_cos = x[:, :, 1::2].cos()
        return paddle.stack((even_sin, odd_cos), axis=3).flatten(2)

    # One chunk of width 1 per coordinate along the last axis.
    num_coords = pos_tensor.shape[-1]
    pos_res = [sine_func(x) for x in pos_tensor.split(num_coords, -1)]
    if exchange_xy:
        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
    return paddle.concat(pos_res, axis=2)
|
||||
|
||||
|
||||
def mask_to_box_coordinate(mask,
                           normalize=False,
                           format="xyxy",
                           dtype="float32"):
    """
    Compute the bounding boxes around the provided mask.

    Args:
        mask (Tensor:bool): [b, c, h, w]
        normalize (bool): If True, divide the boxes by (w, h, w, h).
        format (str): "xyxy" returns (x_min, y_min, x_max, y_max). "xywh"
            returns the result of `bbox_xyxy_to_cxcywh` on those boxes.
        dtype (str): Dtype of the coordinate grids and of the all-zero
            result.

    Returns:
        bbox (Tensor): [b, c, 4]. All zeros if the mask has no set element.
            The max corner is one past the largest set index.
    """
    assert mask.ndim == 4
    assert format in ["xyxy", "xywh"]
    if mask.sum() == 0:
        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)

    h, w = mask.shape[-2:]
    rows = paddle.arange(end=h, dtype=dtype)
    cols = paddle.arange(end=w, dtype=dtype)
    y, x = paddle.meshgrid(rows, cols)

    # Large filler so positions outside the mask never win the minimum.
    x_mask = x * mask
    x_max = x_mask.flatten(-2).max(-1) + 1
    x_min = paddle.where(mask, x_mask,
                         paddle.to_tensor(1e8)).flatten(-2).min(-1)

    y_mask = y * mask
    y_max = y_mask.flatten(-2).max(-1) + 1
    y_min = paddle.where(mask, y_mask,
                         paddle.to_tensor(1e8)).flatten(-2).min(-1)
    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)
    if normalize:
        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)

    if format == "xyxy":
        return out_bbox
    return bbox_xyxy_to_cxcywh(out_bbox)
|
||||
|
||||
|
||||
def varifocal_loss_with_logits(pred_logits,
                               gt_score,
                               label,
                               normalizer=1.0,
                               alpha=0.75,
                               gamma=2.0):
    """Weighted binary cross-entropy against a soft target score.

    Args:
        pred_logits (Tensor): Raw (pre-sigmoid) predictions.
        gt_score (Tensor): Target scores used as the BCE target.
        label (Tensor): Indicator selecting which weight term applies
            (multiplied as `label` and `1 - label`).
        normalizer (float): Divisor applied to the reduced loss.
        alpha (float): Scale of the weight where `label` is 0.
        gamma (float): Exponent on the predicted score where `label` is 0.

    Returns:
        Tensor: Loss averaged over axis 1, summed over the remaining axes
        and divided by `normalizer`.
    """
    pred_score = F.sigmoid(pred_logits)
    negative_weight = alpha * pred_score.pow(gamma) * (1 - label)
    positive_weight = gt_score * label
    weight = negative_weight + positive_weight
    elementwise = F.binary_cross_entropy_with_logits(
        pred_logits, gt_score, weight=weight, reduction='none')
    return elementwise.mean(1).sum() / normalizer
|
||||
|
||||
|
||||
|
||||
|
||||
from ..initializer import linear_init_
|
||||
|
||||
class MLP(nn.Layer):
    """Multi-layer perceptron with ReLU between the linear layers.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py

    Args:
        input_dim (int): Size of the input features.
        hidden_dim (int): Size of every intermediate layer.
        output_dim (int): Size of the output features.
        num_layers (int): Total number of linear layers.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        hidden_sizes = [hidden_dim] * (num_layers - 1)
        in_sizes = [input_dim] + hidden_sizes
        out_sizes = hidden_sizes + [output_dim]
        self.layers = nn.LayerList(
            [nn.Linear(n, k) for n, k in zip(in_sizes, out_sizes)])

        self._reset_parameters()

    def _reset_parameters(self):
        """Apply `linear_init_` to every linear layer."""
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        """Apply the layers in order; the last layer has no activation."""
        last = self.num_layers - 1
        for i, layer in enumerate(self.layers):
            x = layer(x)
            if i < last:
                x = F.relu(x)
        return x
|
||||
|
||||
Reference in New Issue
Block a user