first commit

This commit is contained in:
陈赣
2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions

View File

@@ -0,0 +1,20 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .utils import *
from .matchers import *
from .position_encoding import *
from .rtdetr_transformer import *
from .dino_transformer import *
from .hybrid_encoder import *

View File

@@ -0,0 +1,537 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding
from .utils import _get_clones, get_valid_ratio
from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
__all__ = ['DeformableTransformer']
class MSDeformableAttention(nn.Layer):
    """Multi-scale deformable attention.

    For every query the module predicts, per head and per feature level,
    ``num_points`` 2-D sampling offsets and one attention weight per sampled
    location.  The projected values, the sampling locations and the weights
    are then handed to ``ms_deformable_attn_core`` (custom CUDA op when it
    can be imported, otherwise the pure-Paddle implementation from
    ``.utils``), and its result goes through a final linear projection.
    """

    def __init__(self,
                 embed_dim=256,
                 num_heads=8,
                 num_levels=4,
                 num_points=4,
                 lr_mult=0.1):
        """
        Multi-Scale Deformable Attention Module

        Args:
            embed_dim (int): Channel size of query/value; must be divisible
                by ``num_heads``.
            num_heads (int): Number of attention heads.
            num_levels (int): Number of feature levels that are sampled.
            num_points (int): Sampling points per head and per level.
            lr_mult (float): Learning-rate multiplier set on the weight and
                bias of the sampling-offset projection.
        """
        super(MSDeformableAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_levels = num_levels
        self.num_points = num_points
        self.total_points = num_heads * num_levels * num_points

        self.head_dim = embed_dim // num_heads
        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"

        # Two coordinates (x, y) per sampled point.
        self.sampling_offsets = nn.Linear(
            embed_dim,
            self.total_points * 2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.attention_weights = nn.Linear(embed_dim, self.total_points)
        self.value_proj = nn.Linear(embed_dim, embed_dim)
        self.output_proj = nn.Linear(embed_dim, embed_dim)
        try:
            # use cuda op
            from deformable_detr_ops import ms_deformable_attn
        except Exception:
            # Any failure to import/load the optional custom op falls back
            # to the Paddle implementation.  ``Exception`` (rather than a
            # bare ``except``) keeps KeyboardInterrupt/SystemExit
            # propagating instead of being silently swallowed here.
            # use paddle func
            from .utils import deformable_attention_core_func as ms_deformable_attn
        self.ms_deformable_attn_core = ms_deformable_attn

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise all projections.

        The sampling-offset bias is set so that each head starts out
        looking in its own direction (angles evenly spaced over the circle),
        with the k-th point of a head placed k steps along that direction.
        """
        # sampling_offsets
        constant_(self.sampling_offsets.weight)
        thetas = paddle.arange(
            self.num_heads,
            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)
        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
        # Normalise each direction by its largest absolute component.
        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)
        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
            [1, self.num_levels, self.num_points, 1])
        # Scale the k-th point (1-based) of every head/level by k.
        scaling = paddle.arange(
            1, self.num_points + 1,
            dtype=paddle.float32).reshape([1, 1, -1, 1])
        grid_init *= scaling
        self.sampling_offsets.bias.set_value(grid_init.flatten())
        # attention_weights
        constant_(self.attention_weights.weight)
        constant_(self.attention_weights.bias)
        # proj
        xavier_uniform_(self.value_proj.weight)
        constant_(self.value_proj.bias)
        xavier_uniform_(self.output_proj.weight)
        constant_(self.output_proj.bias)

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: if the last dimension of ``reference_points`` is
                neither 2 nor 4.
        """
        bs, Len_q = query.shape[:2]
        Len_v = value.shape[1]
        # The flattened value sequence must cover every level exactly.
        assert int(value_spatial_shapes.prod(1).sum()) == Len_v

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the projected values at padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])

        sampling_offsets = self.sampling_offsets(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
        # Softmax jointly over all levels and points of a head, then split
        # the two axes apart again.
        attention_weights = self.attention_weights(query).reshape(
            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
        attention_weights = F.softmax(attention_weights).reshape(
            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])

        if reference_points.shape[-1] == 2:
            # Point references: offsets are divided by the (W, H) of each
            # level (``flip`` turns the stored (H, W) into (W, H)).
            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2])
            sampling_locations = reference_points.reshape([
                bs, Len_q, 1, self.num_levels, 1, 2
            ]) + sampling_offsets / offset_normalizer
        elif reference_points.shape[-1] == 4:
            # Box references: first two entries are the anchor point, last
            # two scale the offsets (times 0.5, divided by num_points).
            sampling_locations = (
                reference_points[:, :, None, :, None, :2] + sampling_offsets /
                self.num_points * reference_points[:, :, None, :, None, 2:] *
                0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(reference_points.shape[-1]))

        output = self.ms_deformable_attn_core(
            value, value_spatial_shapes, value_level_start_index,
            sampling_locations, attention_weights)
        output = self.output_proj(output)

        return output
class DeformableTransformerEncoderLayer(nn.Layer):
    """One encoder layer: deformable self-attention followed by an FFN.

    Both sub-blocks are applied post-norm style, i.e. residual add first and
    LayerNorm afterwards.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            d_model (int): Feature width of the layer.
            n_head (int): Number of attention heads.
            dim_feedforward (int): Hidden width of the FFN.
            dropout (float): Dropout probability used by every dropout here.
            activation (str): Name of a function in ``paddle.nn.functional``.
            n_levels (int): Feature levels seen by the deformable attention.
            n_points (int): Sampling points per head and level.
            lr_mult (float): Forwarded to ``MSDeformableAttention``.
            weight_attr, bias_attr: Forwarded to both ``LayerNorm`` layers.
        """
        super().__init__()

        # --- self-attention sub-block ---
        self.self_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                               n_points, lr_mult)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward sub-block ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout2 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout3 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the FFN; the weights are re-drawn with Xavier-uniform
        after ``linear_init_`` has run on each layer."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, src):
        """Apply the FFN with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(src))
        ffn_out = self.linear2(self.dropout2(hidden))
        src = src + self.dropout3(ffn_out)
        return self.norm2(src)

    def forward(self,
                src,
                reference_points,
                spatial_shapes,
                level_start_index,
                src_mask=None,
                query_pos_embed=None):
        """Run the layer.

        The positional embedding is added to the query only; the attention
        values are taken from ``src`` itself.
        """
        # self attention
        query = self.with_pos_embed(src, query_pos_embed)
        attn_out = self.self_attn(query, reference_points, src,
                                  spatial_shapes, level_start_index, src_mask)
        src = src + self.dropout1(attn_out)
        src = self.norm1(src)

        # ffn
        return self.forward_ffn(src)
class DeformableTransformerEncoder(nn.Layer):
    """Stack of ``num_layers`` clones of a deformable encoder layer."""

    def __init__(self, encoder_layer, num_layers):
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers

    @staticmethod
    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
        """Build the per-location reference points for every level.

        For each ``(H, W)`` in ``spatial_shapes`` a grid of cell coordinates
        shifted by ``offset`` is created, divided by ``valid_ratio * size``
        of that level, and finally multiplied by the valid ratios of all
        levels.

        Args:
            spatial_shapes: Iterable of ``(H, W)`` pairs, one per level.
            valid_ratios: Indexed as ``[:, level, 0]`` for x and
                ``[:, level, 1]`` for y.
            offset (float): Shift added to the integer grid coordinates.

        Returns:
            Tensor: concatenation over levels, with an extra axis inserted
            at position 2 and broadcast against ``valid_ratios``.
        """
        valid_ratios = valid_ratios.unsqueeze(1)
        per_level = []
        for lvl, (H, W) in enumerate(spatial_shapes):
            rows = paddle.arange(end=H) + offset
            cols = paddle.arange(end=W) + offset
            ref_y, ref_x = paddle.meshgrid(rows, cols)
            ref_y = ref_y.flatten().unsqueeze(0) / (
                valid_ratios[:, :, lvl, 1] * H)
            ref_x = ref_x.flatten().unsqueeze(0) / (
                valid_ratios[:, :, lvl, 0] * W)
            # Points are stored as (x, y).
            per_level.append(paddle.stack((ref_x, ref_y), axis=-1))
        points = paddle.concat(per_level, 1).unsqueeze(2)
        return points * valid_ratios

    def forward(self,
                feat,
                spatial_shapes,
                level_start_index,
                feat_mask=None,
                query_pos_embed=None,
                valid_ratios=None):
        """Pass ``feat`` through every layer using shared reference points.

        When ``valid_ratios`` is None, all-ones ratios of shape
        ``[batch, n_levels, 2]`` are used.
        """
        if valid_ratios is None:
            batch = feat.shape[0]
            n_levels = spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch, n_levels, 2])

        reference_points = self.get_reference_points(spatial_shapes,
                                                     valid_ratios)
        for enc_layer in self.layers:
            feat = enc_layer(feat, reference_points, spatial_shapes,
                             level_start_index, feat_mask, query_pos_embed)
        return feat
class DeformableTransformerDecoderLayer(nn.Layer):
    """One decoder layer: query self-attention, deformable cross-attention
    into the encoder memory, then an FFN (each followed by residual add and
    LayerNorm)."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=0.1,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            d_model (int): Feature width of the layer.
            n_head (int): Number of attention heads (both attentions).
            dim_feedforward (int): Hidden width of the FFN.
            dropout (float): Dropout probability used throughout.
            activation (str): Name of a function in ``paddle.nn.functional``.
            n_levels (int): Feature levels seen by the cross-attention.
            n_points (int): Sampling points per head and level.
            lr_mult (float): Forwarded to ``MSDeformableAttention``.
            weight_attr, bias_attr: Forwarded to the three ``LayerNorm``s.
        """
        super().__init__()

        # --- self-attention among queries ---
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- deformable cross-attention into memory ---
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the FFN; the weights are re-drawn with Xavier-uniform
        after ``linear_init_`` has run on each layer."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Apply the FFN with residual connection and LayerNorm."""
        hidden = self.activation(self.linear1(tgt))
        ffn_out = self.linear2(self.dropout3(hidden))
        tgt = tgt + self.dropout4(ffn_out)
        return self.norm3(tgt)

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        """Run the layer on the queries ``tgt`` against ``memory``."""
        # self attention: positions go into query/key, not into the value
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        self_out = self.self_attn(q, k, value=tgt)
        tgt = tgt + self.dropout1(self_out)
        tgt = self.norm1(tgt)

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = tgt + self.dropout2(cross_out)
        tgt = self.norm2(tgt)

        # ffn
        return self.forward_ffn(tgt)
class DeformableTransformerDecoder(nn.Layer):
    """Stack of ``num_layers`` clones of a deformable decoder layer."""

    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
        """
        Args:
            decoder_layer (nn.Layer): Layer that gets cloned.
            num_layers (int): Number of clones.
            return_intermediate (bool): If True, ``forward`` returns the
                stacked outputs of every layer instead of only the last.
        """
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                memory_mask=None,
                query_pos_embed=None):
        """Decode ``tgt`` layer by layer.

        Returns:
            Tensor: outputs of all layers stacked on a new leading axis when
            ``return_intermediate`` is set; otherwise the final output with
            a leading axis of size 1 added.
        """
        output = tgt
        per_layer = []
        for dec_layer in self.layers:
            output = dec_layer(output, reference_points, memory,
                               memory_spatial_shapes,
                               memory_level_start_index, memory_mask,
                               query_pos_embed)
            if self.return_intermediate:
                per_layer.append(output)

        if not self.return_intermediate:
            return output.unsqueeze(0)
        return paddle.stack(per_layer)
@register
class DeformableTransformer(nn.Layer):
    """Deformable-DETR transformer: projects multi-level backbone features,
    encodes them with deformable self-attention and decodes a fixed set of
    learned queries against the encoded memory."""

    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=300,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 in_feats_channel=[512, 1024, 2048],
                 num_feature_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.1,
                 activation="relu",
                 lr_mult=0.1,
                 pe_temperature=10000,
                 pe_offset=-0.5):
        """
        Args:
            num_queries (int): Number of learned object queries.
            position_embed_type (str): ``'sine'`` or ``'learned'``.
            return_intermediate_dec (bool): Forwarded to the decoder.
            in_feats_channel (list[int]): Channels of each backbone level.
            num_feature_levels (int): Total levels fed to the transformer;
                must be at least ``len(in_feats_channel)``.  Missing levels
                are produced by stride-2 3x3 convolutions.
            num_encoder_points / num_decoder_points (int): Sampling points
                of the deformable attention in encoder / decoder.
            hidden_dim (int): Transformer width.
            nhead (int): Number of attention heads.
            num_encoder_layers / num_decoder_layers (int): Stack depths.
            dim_feedforward (int): FFN hidden width.
            dropout (float): Dropout probability.
            activation (str): Name of a function in ``paddle.nn.functional``.
            lr_mult (float): Learning-rate multiplier for the encoder's
                sampling offsets and for the reference-point projection.
            pe_temperature, pe_offset: Forwarded to ``PositionEmbedding``.
        """
        super().__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_feature_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_feature_levels = num_feature_levels

        encoder_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_encoder_points, lr_mult)
        self.encoder = DeformableTransformerEncoder(encoder_layer,
                                                    num_encoder_layers)

        # NOTE(review): ``lr_mult`` is not forwarded here, so the decoder
        # layer always uses its own default -- confirm this is intended.
        decoder_layer = DeformableTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_feature_levels, num_decoder_points)
        self.decoder = DeformableTransformerDecoder(
            decoder_layer, num_decoder_layers, return_intermediate_dec)

        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)

        # Maps each query position embedding to a 2-D reference point.
        self.reference_points = nn.Linear(
            hidden_dim,
            2,
            weight_attr=ParamAttr(learning_rate=lr_mult),
            bias_attr=ParamAttr(learning_rate=lr_mult))

        self.input_proj = nn.LayerList()
        # 1x1 projection for every level delivered by the backbone.
        for in_channels in in_feats_channel:
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels, hidden_dim, kernel_size=1),
                    nn.GroupNorm(32, hidden_dim)))
        # Extra levels: stride-2 convs, the first fed by the last backbone
        # level, every following one by the previous extra level.
        in_channels = in_feats_channel[-1]
        num_extra_levels = num_feature_levels - len(in_feats_channel)
        for _ in range(num_extra_levels):
            self.input_proj.append(
                nn.Sequential(
                    nn.Conv2D(
                        in_channels,
                        hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1),
                    nn.GroupNorm(32, hidden_dim)))
            in_channels = hidden_dim

        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=position_embed_type == 'sine',
            embed_type=position_embed_type,
            offset=pe_offset,
            eps=1e-4)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise embeddings, the reference-point head and the first
        (convolution) layer of every input projection."""
        normal_(self.level_embed.weight)
        normal_(self.tgt_embed.weight)
        normal_(self.query_pos_embed.weight)
        xavier_uniform_(self.reference_points.weight)
        constant_(self.reference_points.bias)
        for proj in self.input_proj:
            conv = proj[0]
            xavier_uniform_(conv.weight)
            constant_(conv.bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_feats_channel`` from the ``channels`` attribute of
        each entry in ``input_shape``."""
        channels = [i.channels for i in input_shape]
        return {'in_feats_channel': channels, }

    def forward(self, src_feats, src_mask=None, *args, **kwargs):
        """Encode the backbone features and decode the learned queries.

        Args:
            src_feats (list[Tensor]): Backbone feature maps, indexed as
                ``[bs, c, h, w]`` per level.
            src_mask (Tensor, optional): Padding mask that is resized to
                each level; when None an all-ones mask is used per level.

        Returns:
            tuple: ``(hs, memory, reference_points)`` -- decoder output,
            encoder output, and the sigmoid-activated reference points
            predicted from the query position embeddings.
        """
        # Project the backbone levels ...
        srcs = [
            self.input_proj[idx](feat) for idx, feat in enumerate(src_feats)
        ]
        # ... and synthesise any additional levels.
        num_backbone_levels = len(srcs)
        for idx in range(num_backbone_levels, self.num_feature_levels):
            if idx == num_backbone_levels:
                proj_input = src_feats[-1]
            else:
                proj_input = srcs[-1]
            srcs.append(self.input_proj[idx](proj_input))

        src_flatten = []
        mask_flatten = []
        lvl_pos_embed_flatten = []
        spatial_shapes = []
        valid_ratios = []
        for level, src in enumerate(srcs):
            # Keep the sizes as 1-element tensors (slices of the dynamic
            # shape) rather than Python ints.
            dyn_shape = paddle.shape(src)
            bs = dyn_shape[0:1]
            h = dyn_shape[2:3]
            w = dyn_shape[3:4]
            spatial_shapes.append(paddle.concat([h, w]))

            # [bs, c, h, w] -> [bs, h*w, c]
            src = src.flatten(2).transpose([0, 2, 1])
            src_flatten.append(src)

            if src_mask is None:
                mask = paddle.ones([bs, h, w])
            else:
                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
            valid_ratios.append(get_valid_ratio(mask))

            # Positional embedding plus a per-level embedding.
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            lvl_pos_embed_flatten.append(pos_embed + self.level_embed.weight[
                level])

            mask_flatten.append(mask.flatten(1))

        src_flatten = paddle.concat(src_flatten, 1)
        # Only hand a mask downstream when the caller supplied one.
        if src_mask is None:
            mask_flatten = None
        else:
            mask_flatten = paddle.concat(mask_flatten, 1)
        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
        # [l, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(spatial_shapes).astype('int64'))
        # [l], start index of every level in the flattened sequence
        level_sizes = spatial_shapes.prod(1)
        level_start_index = paddle.concat(
            [paddle.zeros(
                [1], dtype='int64'), level_sizes.cumsum(0)[:-1]])
        # [b, l, 2]
        valid_ratios = paddle.stack(valid_ratios, 1)

        # encoder
        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten,
                              valid_ratios)

        # prepare input for decoder
        bs, _, _ = memory.shape
        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        reference_points = F.sigmoid(self.reference_points(query_embed))
        # Scale the reference points by the valid ratio of every level.
        reference_points_input = reference_points.unsqueeze(
            2) * valid_ratios.unsqueeze(1)

        # decoder
        hs = self.decoder(tgt, reference_points_input, memory,
                          spatial_shapes, level_start_index, mask_flatten,
                          query_embed)

        return (hs, memory, reference_points)

View File

@@ -0,0 +1,359 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention, _convert_attention_mask
from .position_encoding import PositionEmbedding
from .utils import _get_clones
from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
__all__ = ['DETRTransformer']
class TransformerEncoderLayer(nn.Layer):
    """Standard transformer encoder layer (self-attention + FFN) supporting
    both pre-norm and post-norm via ``normalize_before``."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        """
        Args:
            d_model (int): Feature width.
            nhead (int): Number of attention heads.
            dim_feedforward (int): FFN hidden width.
            dropout (float): Dropout on the residual branches.
            activation (str): Name of a function in ``paddle.nn.functional``.
            attn_dropout (float, optional): Dropout inside the attention;
                falls back to ``dropout`` when None.
            act_dropout (float, optional): Dropout after the activation;
                falls back to ``dropout`` when None.
            normalize_before (bool): LayerNorm before (True) or after
                (False) each sub-block.
        """
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the two FFN linear layers."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply self-attention and the FFN to ``src``."""
        pre_norm = self.normalize_before

        # --- self-attention sub-block ---
        shortcut = src
        if pre_norm:
            src = self.norm1(src)
        # Positions are added to query/key only.
        q = k = self.with_pos_embed(src, pos_embed)
        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
        src = shortcut + self.dropout1(src)
        if not pre_norm:
            src = self.norm1(src)

        # --- feed-forward sub-block ---
        shortcut = src
        if pre_norm:
            src = self.norm2(src)
        hidden = self.activation(self.linear1(src))
        src = self.linear2(self.dropout(hidden))
        src = shortcut + self.dropout2(src)
        if not pre_norm:
            src = self.norm2(src)
        return src
class TransformerEncoder(nn.Layer):
    """Stack of cloned encoder layers with an optional final norm."""

    def __init__(self, encoder_layer, num_layers, norm=None):
        """
        Args:
            encoder_layer (nn.Layer): Layer that gets cloned.
            num_layers (int): Number of clones.
            norm (nn.Layer, optional): Applied to the final output if given.
        """
        super().__init__()
        self.layers = _get_clones(encoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm

    def forward(self, src, src_mask=None, pos_embed=None):
        """Run ``src`` through every layer, then through ``norm`` if set."""
        hidden = src
        for enc_layer in self.layers:
            hidden = enc_layer(hidden, src_mask=src_mask, pos_embed=pos_embed)

        if self.norm is None:
            return hidden
        return self.norm(hidden)
class TransformerDecoderLayer(nn.Layer):
    """Standard transformer decoder layer (self-attention, cross-attention,
    FFN) supporting pre-norm and post-norm via ``normalize_before``."""

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        """
        Args:
            d_model (int): Feature width.
            nhead (int): Number of attention heads.
            dim_feedforward (int): FFN hidden width.
            dropout (float): Dropout on the residual branches.
            activation (str): Name of a function in ``paddle.nn.functional``.
            attn_dropout (float, optional): Dropout inside both attentions;
                falls back to ``dropout`` when None.
            act_dropout (float, optional): Dropout after the activation;
                falls back to ``dropout`` when None.
            normalize_before (bool): LayerNorm before (True) or after
                (False) each sub-block.
        """
        super().__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the two FFN linear layers."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Decode ``tgt`` against ``memory``.

        ``query_pos_embed`` is added to the queries (and to the keys of the
        self-attention); ``pos_embed`` is added to the memory keys of the
        cross-attention.  Values never receive positional embeddings.
        """
        pre_norm = self.normalize_before
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

        # --- self-attention sub-block ---
        shortcut = tgt
        if pre_norm:
            tgt = self.norm1(tgt)
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask)
        tgt = shortcut + self.dropout1(tgt)
        if not pre_norm:
            tgt = self.norm1(tgt)

        # --- cross-attention sub-block ---
        shortcut = tgt
        if pre_norm:
            tgt = self.norm2(tgt)
        q = self.with_pos_embed(tgt, query_pos_embed)
        k = self.with_pos_embed(memory, pos_embed)
        tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask)
        tgt = shortcut + self.dropout2(tgt)
        if not pre_norm:
            tgt = self.norm2(tgt)

        # --- feed-forward sub-block ---
        shortcut = tgt
        if pre_norm:
            tgt = self.norm3(tgt)
        hidden = self.activation(self.linear1(tgt))
        tgt = self.linear2(self.dropout(hidden))
        tgt = shortcut + self.dropout3(tgt)
        if not pre_norm:
            tgt = self.norm3(tgt)
        return tgt
class TransformerDecoder(nn.Layer):
    """Stack of cloned decoder layers with an optional output norm."""

    def __init__(self,
                 decoder_layer,
                 num_layers,
                 norm=None,
                 return_intermediate=False):
        """
        Args:
            decoder_layer (nn.Layer): Layer that gets cloned.
            num_layers (int): Number of clones.
            norm (nn.Layer, optional): Normalisation applied to the final
                output and to every collected intermediate output.
            return_intermediate (bool): If True, ``forward`` returns the
                stacked outputs of every layer instead of only the last.
        """
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.num_layers = num_layers
        self.norm = norm
        self.return_intermediate = return_intermediate

    def forward(self,
                tgt,
                memory,
                tgt_mask=None,
                memory_mask=None,
                pos_embed=None,
                query_pos_embed=None):
        """Decode ``tgt`` against ``memory`` layer by layer.

        Returns:
            Tensor: when ``return_intermediate`` is set, the outputs of all
            layers stacked on a new leading axis (each passed through
            ``norm`` if one was given); otherwise the final output (normed
            if ``norm`` was given) with a leading axis of size 1 added.
        """
        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

        output = tgt
        intermediate = []
        for layer in self.layers:
            output = layer(
                output,
                memory,
                tgt_mask=tgt_mask,
                memory_mask=memory_mask,
                pos_embed=pos_embed,
                query_pos_embed=query_pos_embed)
            if self.return_intermediate:
                # ``norm`` defaults to None; calling it unconditionally
                # raised TypeError for return_intermediate=True, norm=None.
                if self.norm is not None:
                    intermediate.append(self.norm(output))
                else:
                    intermediate.append(output)

        if self.norm is not None:
            output = self.norm(output)

        if self.return_intermediate:
            return paddle.stack(intermediate)

        return output.unsqueeze(0)
@register
class DETRTransformer(nn.Layer):
    """DETR transformer operating on the last backbone feature map."""

    __shared__ = ['hidden_dim']

    def __init__(self,
                 num_queries=100,
                 position_embed_type='sine',
                 return_intermediate_dec=True,
                 backbone_num_channels=2048,
                 hidden_dim=256,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=2048,
                 dropout=0.1,
                 activation="relu",
                 pe_temperature=10000,
                 pe_offset=0.,
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        """
        Args:
            num_queries (int): Number of learned object queries.
            position_embed_type (str): ``'sine'`` or ``'learned'``.
            return_intermediate_dec (bool): Forwarded to the decoder.
            backbone_num_channels (int): Channels of the feature map that
                is projected to ``hidden_dim``.
            hidden_dim (int): Transformer width.
            nhead (int): Number of attention heads.
            num_encoder_layers / num_decoder_layers (int): Stack depths.
            dim_feedforward (int): FFN hidden width.
            dropout, attn_dropout, act_dropout: Forwarded to the layers.
            activation (str): Name of a function in ``paddle.nn.functional``.
            pe_temperature, pe_offset: Forwarded to ``PositionEmbedding``.
            normalize_before (bool): Pre-norm layers; also enables the
                final encoder LayerNorm.
        """
        super().__init__()
        assert position_embed_type in ['sine', 'learned'],\
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        self.hidden_dim = hidden_dim
        self.nhead = nhead

        encoder_layer = TransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        # The encoder only gets a final norm in the pre-norm configuration.
        if normalize_before:
            encoder_norm = nn.LayerNorm(hidden_dim)
        else:
            encoder_norm = None
        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
                                          encoder_norm)

        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            attn_dropout, act_dropout, normalize_before)
        decoder_norm = nn.LayerNorm(hidden_dim)
        self.decoder = TransformerDecoder(
            decoder_layer,
            num_decoder_layers,
            decoder_norm,
            return_intermediate=return_intermediate_dec)

        self.input_proj = nn.Conv2D(
            backbone_num_channels, hidden_dim, kernel_size=1)
        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=position_embed_type == 'sine',
            embed_type=position_embed_type,
            offset=pe_offset)

        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise every parameter with more than one dimension,
        then re-initialise the input projection and the query embedding
        with their dedicated initialisers."""
        for param in self.parameters():
            if param.dim() > 1:
                xavier_uniform_(param)
        conv_init_(self.input_proj)
        normal_(self.query_pos_embed.weight)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Take ``backbone_num_channels`` from the last input shape."""
        channels = [i.channels for i in input_shape]
        return {'backbone_num_channels': channels[-1], }

    def _convert_attention_mask(self, mask):
        """Map mask value 1 to 0 and mask value 0 to -1e9 (additive mask)."""
        return (mask - 1.0) * 1e9

    def forward(self, src, src_mask=None, *args, **kwargs):
        r"""
        Applies a Transformer model on the inputs.

        Parameters:
            src (List(Tensor)): Backbone feature maps with shape
                [[bs, c, h, w]]; only the last one is used.
            src_mask (Tensor, optional): Mask that is resized to the
                [h, w] of the projected feature map. When None, an all-ones
                mask is used. During training it is turned into an additive
                attention mask via ``(mask - 1) * 1e9``; outside training no
                attention mask is passed to encoder or decoder.
                Default None.

        Returns:
            output (Tensor): Decoder output.
            memory (Tensor): Encoder output reshaped to [bs, c, h, w].
            src_proj (Tensor): The projected last feature map.
            src_mask (Tensor|None): Additive mask reshaped to
                [bs, 1, 1, h, w] in training, otherwise None.
        """
        # use last level feature map
        src_proj = self.input_proj(src[-1])
        bs, c, h, w = paddle.shape(src_proj)
        # flatten [B, C, H, W] to [B, HxW, C]
        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])

        if src_mask is None:
            src_mask = paddle.ones([bs, h, w])
        else:
            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
        pos_embed = self.position_embedding(src_mask).flatten(1, 2)

        is_training = self.training
        if is_training:
            src_mask = self._convert_attention_mask(src_mask)
            src_mask = src_mask.reshape([bs, 1, 1, h * w])
        else:
            src_mask = None

        memory = self.encoder(
            src_flatten, src_mask=src_mask, pos_embed=pos_embed)

        # One copy of the learned query embeddings per batch element; the
        # decoder input itself starts from zeros.
        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
            [bs, 1, 1])
        tgt = paddle.zeros_like(query_pos_embed)
        output = self.decoder(
            tgt,
            memory,
            memory_mask=src_mask,
            pos_embed=pos_embed,
            query_pos_embed=query_pos_embed)

        if is_training:
            src_mask = src_mask.reshape([bs, 1, 1, h, w])
        else:
            src_mask = None

        memory_map = memory.transpose([0, 2, 1]).reshape([bs, c, h, w])
        return (output, memory_map, src_proj, src_mask)

View File

@@ -0,0 +1,527 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Modified from detrex (https://github.com/IDEA-Research/detrex)
# Copyright 2022 The IDEA Authors. All rights reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .position_encoding import PositionEmbedding
from .deformable_transformer import (MSDeformableAttention,
DeformableTransformerEncoderLayer,
DeformableTransformerEncoder)
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
bias_init_with_prob)
from .utils import (_get_clones, get_valid_ratio,
get_contrastive_denoising_training_group,
get_sine_pos_embed, inverse_sigmoid, MLP)
__all__ = ['DINOTransformer']
class DINOTransformerDecoderLayer(nn.Layer):
    """DINO decoder layer: masked query self-attention, deformable
    cross-attention into the encoder memory, then an FFN (each followed by
    residual add and LayerNorm)."""

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 lr_mult=1.0,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            d_model (int): Feature width of the layer.
            n_head (int): Number of attention heads (both attentions).
            dim_feedforward (int): Hidden width of the FFN.
            dropout (float): Dropout probability used throughout.
            activation (str): Name of a function in ``paddle.nn.functional``.
            n_levels (int): Feature levels seen by the cross-attention.
            n_points (int): Sampling points per head and level.
            lr_mult (float): Forwarded to ``MSDeformableAttention``.
            weight_attr, bias_attr: Forwarded to the three ``LayerNorm``s.
        """
        super().__init__()

        # --- self-attention among queries ---
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- deformable cross-attention into memory ---
        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
                                                n_points, lr_mult)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        # --- feed-forward ---
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(
            d_model, weight_attr=weight_attr, bias_attr=bias_attr)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialise the FFN; the weights are re-drawn with Xavier-uniform
        after ``linear_init_`` has run on each layer."""
        linear_init_(self.linear1)
        linear_init_(self.linear2)
        xavier_uniform_(self.linear1.weight)
        xavier_uniform_(self.linear2.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Return the raw FFN output; the residual add and norm are done by
        the caller."""
        hidden = self.activation(self.linear1(tgt))
        return self.linear2(self.dropout3(hidden))

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run the layer on the queries ``tgt`` against ``memory``.

        ``attn_mask`` (if given) is interpreted as boolean: truthy entries
        become an additive 0, all others become ``-inf``.
        """
        # self attention
        q = k = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            mask_shape = attn_mask.shape
            attn_mask = paddle.where(
                attn_mask.astype('bool'),
                paddle.zeros(mask_shape, tgt.dtype),
                paddle.full(mask_shape, float("-inf"), tgt.dtype))
        self_out = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
        tgt = tgt + self.dropout1(self_out)
        tgt = self.norm1(tgt)

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(cross_query, reference_points, memory,
                                    memory_spatial_shapes,
                                    memory_level_start_index, memory_mask)
        tgt = tgt + self.dropout2(cross_out)
        tgt = self.norm2(tgt)

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = tgt + self.dropout4(ffn_out)
        return self.norm3(tgt)
class DINOTransformerDecoder(nn.Layer):
    """DINO decoder: a stack of cloned decoder layers that iteratively
    refines the reference boxes after every layer."""

    def __init__(self,
                 hidden_dim,
                 decoder_layer,
                 num_layers,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            hidden_dim (int): Feature width; also sizes the output norm.
            decoder_layer (nn.Layer): Layer that gets cloned.
            num_layers (int): Number of clones.
            weight_attr, bias_attr: Forwarded to the output ``LayerNorm``.
        """
        super().__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.norm = nn.LayerNorm(
            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                query_pos_head,
                valid_ratios=None,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` and refine the reference points layer by layer.

        Args:
            tgt: Initial query features.
            ref_points_unact: Reference points before the sigmoid.
            memory, memory_spatial_shapes, memory_level_start_index,
            memory_mask: Encoder output and its layout, forwarded to every
                layer.
            bbox_head: Indexable collection; ``bbox_head[i]`` predicts the
                box delta after layer ``i``.
            query_pos_head: Callable mapping the sine embedding of the
                reference points to the query positional embedding.
            valid_ratios: Per-level valid ratios; all-ones of shape
                ``[batch, n_levels, 2]`` when None.
            attn_mask: Forwarded to the self-attention of every layer.

        Returns:
            tuple(Tensor, Tensor): normed outputs of all layers and the
            refined reference points of all layers, each stacked on a new
            leading axis.
        """
        if valid_ratios is None:
            batch = memory.shape[0]
            n_levels = memory_spatial_shapes.shape[0]
            valid_ratios = paddle.ones([batch, n_levels, 2])

        output = tgt
        layer_outputs = []
        layer_bboxes = []
        ref_points = F.sigmoid(ref_points_unact)
        for i, layer in enumerate(self.layers):
            # Gradients do not flow through the previous layer's boxes.
            detached_ref = ref_points.detach()
            # The 2-entry ratios are tiled to match the 4-entry boxes.
            reference_points_input = detached_ref.unsqueeze(
                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)

            # Positional embedding from the level-0 reference points.
            sine_embed = get_sine_pos_embed(
                reference_points_input[..., 0, :], self.hidden_dim // 2)
            query_pos_embed = query_pos_head(sine_embed)

            output = layer(output, reference_points_input, memory,
                           memory_spatial_shapes, memory_level_start_index,
                           attn_mask, memory_mask, query_pos_embed)

            # Refine in logit space: delta + inverse_sigmoid(previous box).
            box_delta = bbox_head[i](output)
            ref_points = F.sigmoid(box_delta + inverse_sigmoid(
                ref_points.detach()))

            layer_outputs.append(self.norm(output))
            layer_bboxes.append(ref_points)

        return paddle.stack(layer_outputs), paddle.stack(layer_bboxes)
@register
class DINOTransformer(nn.Layer):
    """Deformable encoder/decoder transformer with denoising queries.

    Backbone feature maps are projected to ``hidden_dim`` channels, encoded by
    a ``DeformableTransformerEncoder``, and the ``num_queries`` highest-scoring
    encoder tokens provide the initial reference boxes for a
    ``DINOTransformerDecoder``. In training mode, denoising queries produced by
    ``get_contrastive_denoising_training_group`` are prepended to the decoder
    input.
    """
    __shared__ = ['num_classes', 'hidden_dim']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=900,
                 position_embed_type='sine',
                 in_feats_channel=[512, 1024, 2048],
                 num_levels=4,
                 num_encoder_points=4,
                 num_decoder_points=4,
                 nhead=8,
                 num_encoder_layers=6,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 lr_mult=1.0,
                 pe_temperature=10000,
                 pe_offset=-0.5,
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eps=1e-2):
        """Build projection layers, encoder, decoder and prediction heads.

        ``in_feats_channel`` may list fewer entries than ``num_levels``; the
        missing levels are produced by extra stride-2 convolutions. ``eps`` is
        the margin used when deciding which anchors are valid.
        """
        super(DINOTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(in_feats_channel) <= num_levels

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers

        # Normalization parameters are created with zero L2 decay.
        norm_weight_attr = ParamAttr(regularizer=L2Decay(0.0))
        norm_bias_attr = ParamAttr(regularizer=L2Decay(0.0))

        # NOTE: sub-layers below are created in a fixed order; do not reorder
        # them, parameter registration/initialization order depends on it.

        # backbone feature projection
        self._build_input_proj_layer(in_feats_channel, norm_weight_attr,
                                     norm_bias_attr)

        # Transformer module
        enc_layer = DeformableTransformerEncoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_levels, num_encoder_points, lr_mult, norm_weight_attr,
            norm_bias_attr)
        self.encoder = DeformableTransformerEncoder(enc_layer,
                                                    num_encoder_layers)
        dec_layer = DINOTransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation,
            num_levels, num_decoder_points, lr_mult, norm_weight_attr,
            norm_bias_attr)
        self.decoder = DINOTransformerDecoder(hidden_dim, dec_layer,
                                              num_decoder_layers,
                                              norm_weight_attr,
                                              norm_bias_attr)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # position embedding
        self.position_embedding = PositionEmbedding(
            hidden_dim // 2,
            temperature=pe_temperature,
            normalize=(position_embed_type == 'sine'),
            embed_type=position_embed_type,
            offset=pe_offset)
        self.level_embed = nn.Embedding(num_levels, hidden_dim)

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(
            2 * hidden_dim, hidden_dim, hidden_dim, num_layers=2)

        # encoder head
        enc_linear = nn.Linear(hidden_dim, hidden_dim)
        enc_norm = nn.LayerNorm(
            hidden_dim, weight_attr=norm_weight_attr,
            bias_attr=norm_bias_attr)
        self.enc_output = nn.Sequential(enc_linear, enc_norm)
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head
        score_heads = [
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ]
        self.dec_score_head = nn.LayerList(score_heads)
        box_heads = [
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ]
        self.dec_bbox_head = nn.LayerList(box_heads)

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize the heads, embeddings and input projections."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        enc_box_out = self.enc_bbox_head.layers[-1]
        constant_(enc_box_out.weight)
        constant_(enc_box_out.bias)

        for score_head, box_head in zip(self.dec_score_head,
                                        self.dec_bbox_head):
            linear_init_(score_head)
            constant_(score_head.bias, bias_cls)
            box_out = box_head.layers[-1]
            constant_(box_out.weight)
            constant_(box_out.bias)

        enc_linear = self.enc_output[0]
        linear_init_(enc_linear)
        xavier_uniform_(enc_linear.weight)
        normal_(self.level_embed.weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for proj in self.input_proj:
            # proj[0] is the convolution of each projection block.
            xavier_uniform_(proj[0].weight)
            constant_(proj[0].bias)

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_feats_channel`` from the ``channels`` of each input."""
        channels = [spec.channels for spec in input_shape]
        return {'in_feats_channel': channels}

    def _build_input_proj_layer(self,
                                in_feats_channel,
                                weight_attr=None,
                                bias_attr=None):
        """Create ``self.input_proj``: one conv + GroupNorm block per level.

        Levels backed by a backbone feature use a 1x1 convolution; the
        remaining ``num_levels - len(in_feats_channel)`` levels use a 3x3
        stride-2 convolution.
        """
        self.input_proj = nn.LayerList()
        for num_in in in_feats_channel:
            conv = nn.Conv2D(num_in, self.hidden_dim, kernel_size=1)
            norm = nn.GroupNorm(
                32,
                self.hidden_dim,
                weight_attr=weight_attr,
                bias_attr=bias_attr)
            self.input_proj.append(
                nn.Sequential(('conv', conv), ('norm', norm)))

        # The first extra level reads the last backbone feature; later extra
        # levels read the previous projected level (hidden_dim channels).
        num_in = in_feats_channel[-1]
        num_extra = self.num_levels - len(in_feats_channel)
        for _ in range(num_extra):
            conv = nn.Conv2D(
                num_in, self.hidden_dim, kernel_size=3, stride=2, padding=1)
            norm = nn.GroupNorm(
                32,
                self.hidden_dim,
                weight_attr=weight_attr,
                bias_attr=bias_attr)
            self.input_proj.append(
                nn.Sequential(('conv', conv), ('norm', norm)))
            num_in = self.hidden_dim

    def _get_encoder_input(self, feats, pad_mask=None):
        """Project, flatten and concatenate the multi-level features.

        Returns:
            tuple: ``(feat_flatten, spatial_shapes, level_start_index,
            mask_flatten, lvl_pos_embed_flatten, valid_ratios)``;
            ``mask_flatten`` is None when ``pad_mask`` is None.
        """
        # get projection features
        proj_feats = [
            self.input_proj[i](feat) for i, feat in enumerate(feats)
        ]
        num_given = len(proj_feats)
        for lvl in range(num_given, self.num_levels):
            src = feats[-1] if lvl == num_given else proj_feats[-1]
            proj_feats.append(self.input_proj[lvl](src))

        # get encoder inputs
        has_mask = pad_mask is not None
        flat_feats = []
        flat_masks = []
        flat_pos = []
        level_shapes = []
        level_ratios = []
        for lvl, feat in enumerate(proj_feats):
            bs, _, h, w = paddle.shape(feat)
            level_shapes.append(paddle.stack([h, w]))
            # [b,c,h,w] -> [b,h*w,c]
            flat_feats.append(feat.flatten(2).transpose([0, 2, 1]))
            if has_mask:
                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
            else:
                mask = paddle.ones([bs, h, w])
            level_ratios.append(get_valid_ratio(mask))
            # [b, h*w, c]
            pos_embed = self.position_embedding(mask).flatten(1, 2)
            flat_pos.append(pos_embed + self.level_embed.weight[lvl])
            if has_mask:
                # [b, h*w]
                flat_masks.append(mask.flatten(1))

        # [b, l, c]
        feat_flatten = paddle.concat(flat_feats, 1)
        # [b, l]
        mask_flatten = paddle.concat(flat_masks, 1) if has_mask else None
        # [b, l, c]
        lvl_pos_embed_flatten = paddle.concat(flat_pos, 1)
        # [num_levels, 2]
        spatial_shapes = paddle.to_tensor(
            paddle.stack(level_shapes).astype('int64'))
        # [l] start index of each level
        tokens_per_level = spatial_shapes.prod(1)
        level_start_index = paddle.concat([
            paddle.zeros(
                [1], dtype='int64'), tokens_per_level.cumsum(0)[:-1]
        ])
        # [b, num_levels, 2]
        valid_ratios = paddle.stack(level_ratios, 1)
        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
                lvl_pos_embed_flatten, valid_ratios)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Encode ``feats`` and decode boxes and logits.

        Args:
            feats: Sequence of backbone feature maps.
            pad_mask: Optional padding mask, resized per level.
            gt_meta: Ground-truth metadata; only read in training mode, where
                it is passed to the denoising-group builder.

        Returns:
            tuple: ``(out_bboxes, out_logits, enc_topk_bboxes,
            enc_topk_logits, dn_meta)``; ``dn_meta`` is None outside training.
        """
        # input projection and embedding
        encoder_inputs = self._get_encoder_input(feats, pad_mask)
        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
         lvl_pos_embed_flatten, valid_ratios) = encoder_inputs

        # encoder
        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
                              mask_flatten, lvl_pos_embed_flatten,
                              valid_ratios)

        # prepare denoising training
        if self.training:
            dn_group = get_contrastive_denoising_training_group(
                gt_meta, self.num_classes, self.num_queries,
                self.denoising_class_embed.weight, self.num_denoising,
                self.label_noise_ratio, self.box_noise_scale)
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                dn_group
        else:
            denoising_class = denoising_bbox_unact = None
            attn_mask = dn_meta = None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
                memory, spatial_shapes, mask_flatten, denoising_class,
                denoising_bbox_unact)

        # decoder
        inter_feats, inter_bboxes = self.decoder(
            target, init_ref_points_unact, memory, spatial_shapes,
            level_start_index, self.dec_bbox_head, self.query_pos_head,
            valid_ratios, attn_mask, mask_flatten)

        out_bboxes = []
        out_logits = []
        for i in range(self.num_decoder_layers):
            layer_feat = inter_feats[i]
            out_logits.append(self.dec_score_head[i](layer_feat))
            box_delta = self.dec_bbox_head[i](layer_feat)
            # Layer 0 refines the initial reference points; later layers
            # refine the boxes emitted by the previous layer.
            if i == 0:
                prior_unact = init_ref_points_unact
            else:
                prior_unact = inverse_sigmoid(inter_bboxes[i - 1])
            out_bboxes.append(F.sigmoid(box_delta + prior_unact))
        out_bboxes = paddle.stack(out_bboxes)
        out_logits = paddle.stack(out_logits)

        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _get_encoder_output_anchors(self,
                                    memory,
                                    spatial_shapes,
                                    memory_mask=None,
                                    grid_size=0.05):
        """Build per-token anchors and the masked, projected encoder memory.

        Anchors are ``(cx, cy, w, h)`` in logit space; anchors judged invalid
        are set to ``inf`` and the matching memory rows are zeroed before
        ``self.enc_output`` is applied.

        Returns:
            tuple: ``(output_memory, output_anchors)``.
        """
        per_level_anchors = []
        idx = 0
        for lvl, (h, w) in enumerate(spatial_shapes):
            num_tokens = h * w
            if memory_mask is None:
                valid_H, valid_W = h, w
            else:
                level_mask = memory_mask[:, idx:idx + num_tokens].reshape(
                    [-1, h, w])
                valid_H = paddle.sum(level_mask[:, :, 0], 1)
                valid_W = paddle.sum(level_mask[:, 0, :], 1)

            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(end=h), paddle.arange(end=w))
            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)

            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
                [-1, 1, 1, 2]).astype(grid_xy.dtype)
            # Cell centers normalized by the valid width/height.
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            # Anchor size doubles with every level.
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            level_anchors = paddle.concat([grid_xy, wh], -1).reshape(
                [-1, num_tokens, 4])
            per_level_anchors.append(level_anchors)
            idx += num_tokens

        output_anchors = paddle.concat(per_level_anchors, 1)
        above_min = output_anchors > self.eps
        below_max = output_anchors < 1 - self.eps
        valid_mask = (above_min * below_max).all(-1, keepdim=True)
        # inverse sigmoid
        output_anchors = paddle.log(output_anchors / (1 - output_anchors))
        if memory_mask is not None:
            mask_positive = memory_mask.unsqueeze(-1) > 0
            valid_mask = (valid_mask * mask_positive) > 0
        output_anchors = paddle.where(valid_mask, output_anchors,
                                      paddle.to_tensor(float("inf")))

        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)
        return output_memory, output_anchors

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           memory_mask=None,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top-scoring encoder tokens as decoder input.

        Returns:
            tuple: ``(target, reference_points_unact (detached),
            enc_topk_bboxes, enc_topk_logits)``. When denoising inputs are
            given they are concatenated in front of ``target`` and the
            reference points.
        """
        batch_size, _, _ = memory.shape
        # prepare input for decoder
        output_memory, output_anchors = self._get_encoder_output_anchors(
            memory, spatial_shapes, memory_mask)
        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(
            output_memory) + output_anchors

        # Rank tokens by their best class score.
        best_scores = enc_outputs_class.max(-1)
        _, topk_ind = paddle.topk(best_scores, self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=batch_size).astype(topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        gather_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        # unsigmoided.
        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  gather_ind)
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, gather_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile(
                [batch_size, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, gather_ind).detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return (target, reference_points_unact.detach(), enc_topk_bboxes,
                enc_topk_logits)

View File

@@ -0,0 +1,85 @@
# Multi-scale deformable attention自定义OP编译
该自定义OP参考[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)文档实现。
## 1. 环境依赖
- Paddle >= 2.3.2
- gcc 8.2
## 2. 安装
请在当前路径下进行编译安装
```
cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
python setup_ms_deformable_attn_op.py install
```
编译完成后即可使用,以下为`ms_deformable_attn`的使用示例
```
# 引入paddle及自定义op
import paddle
from deformable_detr_ops import ms_deformable_attn
# 构造fake input tensor
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
level_start_index = paddle.concat((paddle.to_tensor(
[0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
value_length = sum([(H * W).item() for H, W in spatial_shapes])
def get_test_tensors(channels):
value = paddle.rand(
[bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
sampling_locations = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points, 2],
dtype=paddle.float32)
attention_weights = paddle.rand(
[bs, query_length, n_heads, n_levels, n_points],
dtype=paddle.float32) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-2, keepdim=True)
return [value, sampling_locations, attention_weights]
value, sampling_locations, attention_weights = get_test_tensors(c)
output = ms_deformable_attn(value,
spatial_shapes,
level_start_index,
sampling_locations,
attention_weights)
```
## 3. 单元测试
可以通过执行单元测试来确认自定义算子功能的正确性,执行单元测试的示例如下所示:
```
python test_ms_deformable_attn_op.py
```
运行成功后,打印如下:
```
*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
*tensor1 True check_gradient_numerical(D=30)
*tensor2 True check_gradient_numerical(D=30)
*tensor3 True check_gradient_numerical(D=30)
*tensor1 True check_gradient_numerical(D=32)
*tensor2 True check_gradient_numerical(D=32)
*tensor3 True check_gradient_numerical(D=32)
*tensor1 True check_gradient_numerical(D=64)
*tensor2 True check_gradient_numerical(D=64)
*tensor3 True check_gradient_numerical(D=64)
*tensor1 True check_gradient_numerical(D=71)
*tensor2 True check_gradient_numerical(D=71)
*tensor3 True check_gradient_numerical(D=71)
*tensor1 True check_gradient_numerical(D=128)
*tensor2 True check_gradient_numerical(D=128)
*tensor3 True check_gradient_numerical(D=128)
*tensor1 True check_gradient_numerical(D=1024)
*tensor2 True check_gradient_numerical(D=1024)
*tensor3 True check_gradient_numerical(D=1024)
*tensor1 True check_gradient_numerical(D=1025)
*tensor2 True check_gradient_numerical(D=1025)
*tensor3 True check_gradient_numerical(D=1025)
*tensor1 True check_gradient_numerical(D=2048)
*tensor2 True check_gradient_numerical(D=2048)
*tensor3 True check_gradient_numerical(D=2048)
*tensor1 True check_gradient_numerical(D=3096)
*tensor2 True check_gradient_numerical(D=3096)
*tensor3 True check_gradient_numerical(D=3096)
```

View File

@@ -0,0 +1,65 @@
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/extension.h"
#include <vector>
// Declarations of the GPU implementations. They are only declared here; the
// definitions are expected in another source file of this extension
// (presumably the companion .cu file - verify against the build script).

// Forward pass: takes the value tensor, the per-level spatial shapes and start
// indices, the sampling locations and the attention weights.
std::vector<paddle::Tensor>
MSDeformableAttnCUDAForward(const paddle::Tensor &value,
                            const paddle::Tensor &value_spatial_shapes,
                            const paddle::Tensor &value_level_start_index,
                            const paddle::Tensor &sampling_locations,
                            const paddle::Tensor &attention_weights);

// Backward pass: same inputs as the forward pass plus the gradient of the
// forward output (`grad_out`).
std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
    const paddle::Tensor &value_level_start_index,
    const paddle::Tensor &sampling_locations,
    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);

//// CPU not implemented
// Shape inference for ms_deformable_attn.
//
// The single output has shape
//   {value_shape[0], sampling_locations_shape[1],
//    value_shape[2] * value_shape[3]}.
// The remaining input shapes are accepted but not used. No bounds checking is
// performed: value_shape must have at least 4 entries and
// sampling_locations_shape at least 2.
std::vector<std::vector<int64_t>>
MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
                           std::vector<int64_t> value_spatial_shapes_shape,
                           std::vector<int64_t> value_level_start_index_shape,
                           std::vector<int64_t> sampling_locations_shape,
                           std::vector<int64_t> attention_weights_shape) {
  const int64_t dim0 = value_shape[0];
  const int64_t dim1 = sampling_locations_shape[1];
  // Last two dims of the value tensor are merged into one output dim.
  const int64_t dim2 = value_shape[2] * value_shape[3];

  std::vector<std::vector<int64_t>> output_shapes;
  output_shapes.push_back({dim0, dim1, dim2});
  return output_shapes;
}
// Dtype inference for ms_deformable_attn: the single output takes the dtype
// of the value input; the other input dtypes are accepted but not used.
std::vector<paddle::DataType>
MSDeformableAttnInferDtype(paddle::DataType value_dtype,
                           paddle::DataType value_spatial_shapes_dtype,
                           paddle::DataType value_level_start_index_dtype,
                           paddle::DataType sampling_locations_dtype,
                           paddle::DataType attention_weights_dtype) {
  std::vector<paddle::DataType> output_dtypes;
  output_dtypes.push_back(value_dtype);
  return output_dtypes;
}
// Register the forward operator `ms_deformable_attn`: five named inputs, one
// output, the CUDA forward kernel, and the shape/dtype inference functions
// MSDeformableAttnInferShape / MSDeformableAttnInferDtype.
PD_BUILD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights"})
    .Outputs({"Out"})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));

// Register the gradient operator: it receives the forward inputs plus the
// gradient of "Out", and declares one gradient output per forward input.
PD_BUILD_GRAD_OP(ms_deformable_attn)
    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
             "AttentionWeights", paddle::Grad("Out")})
    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
              paddle::Grad("AttentionWeights")})
    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
from paddle.utils.cpp_extension import CUDAExtension, setup
if __name__ == "__main__":
    # Build/install the `deformable_detr_ops` extension from the C++
    # registration file and its CUDA source.
    op_sources = ['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']
    cuda_extension = CUDAExtension(sources=op_sources)
    setup(name='deformable_detr_ops', ext_modules=cuda_extension)

View File

@@ -0,0 +1,140 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import os
import sys
import random
import numpy as np
import paddle
# add python path of PaddleDetection to sys.path
parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
if parent_path not in sys.path:
    sys.path.append(parent_path)

from ppdet.modeling.transformers.utils import deformable_attention_core_func

# Reference implementation that the custom op's results are compared against.
ms_deform_attn_core_paddle = deformable_attention_core_func

# The optional first command-line argument selects the GPU. Fall back to GPU 0
# only when it is missing or not an integer; a bare `except` here would also
# swallow KeyboardInterrupt/SystemExit.
try:
    gpu_index = int(sys.argv[1])
except (IndexError, ValueError):
    gpu_index = 0
print(f'Use gpu {gpu_index} to test...')
paddle.set_device(f'gpu:{gpu_index}')

# The custom op must have been built and installed beforehand; abort the
# script with a non-zero exit code if it cannot be imported.
try:
    from deformable_detr_ops import ms_deformable_attn
except Exception as e:
    print('import deformable_detr_ops error', e)
    sys.exit(-1)

# Fix all random seeds so the generated test tensors are reproducible.
paddle.seed(1)
random.seed(1)
np.random.seed(1)

# Problem sizes shared by every check below.
bs, n_heads, c = 2, 8, 8
query_length, n_levels, n_points = 2, 2, 2
spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
# Start offset of each level in the flattened value tensor.
level_start_index = paddle.concat((paddle.to_tensor(
    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
# Total number of value positions over all levels.
value_length = sum((H * W).item() for H, W in spatial_shapes)
def get_test_tensors(channels):
    """Create random inputs for the deformable attention op.

    Args:
        channels (int): Size of the last dimension of the value tensor.

    Returns:
        list: ``[value, sampling_locations, attention_weights]``. The
        attention weights are normalized so that they sum to 1 over the last
        two dimensions.
    """
    value_shape = [bs, value_length, n_heads, channels]
    location_shape = [bs, query_length, n_heads, n_levels, n_points, 2]
    weight_shape = [bs, query_length, n_heads, n_levels, n_points]

    # The three paddle.rand calls stay in this order so the random stream is
    # consumed exactly as before.
    value = paddle.rand(value_shape, dtype=paddle.float32) * 0.01
    sampling_locations = paddle.rand(location_shape, dtype=paddle.float32)
    # Small offset keeps every weight strictly positive before normalizing.
    attention_weights = paddle.rand(weight_shape, dtype=paddle.float32) + 1e-5

    weight_total = attention_weights.sum(-1, keepdim=True).sum(
        -2, keepdim=True)
    attention_weights /= weight_total

    return [value, sampling_locations, attention_weights]
@paddle.no_grad()
def check_forward_equal_with_paddle_float():
    """Compare the custom op's forward output with the reference.

    Prints whether the two outputs are close (rtol=1e-2, atol=1e-3) together
    with the maximum absolute and relative errors.
    """
    test_inputs = get_test_tensors(c)
    value, sampling_locations, attention_weights = test_inputs

    output_paddle = ms_deform_attn_core_paddle(
        value, spatial_shapes, level_start_index, sampling_locations,
        attention_weights)
    output_paddle = output_paddle.detach().cpu()

    output_cuda = ms_deformable_attn(value, spatial_shapes,
                                     level_start_index, sampling_locations,
                                     attention_weights)
    output_cuda = output_cuda.detach().cpu()

    fwdok = paddle.allclose(
        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
    abs_err = (output_cuda - output_paddle).abs()
    max_abs_err = abs_err.max().item()
    # Relative error is measured against the reference output.
    max_rel_err = (abs_err / output_paddle.abs()).max().item()

    print(
        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
    )
def check_gradient_numerical(channels=4):
    """Compare input gradients of the custom op with the reference.

    Both implementations receive identical inputs, the sum of each output is
    back-propagated, and for each of the three inputs a line is printed saying
    whether the gradients are close (rtol=1e-2, atol=1e-3).

    Args:
        channels (int): Size of the last dimension of the value tensor.
    """

    def grads_close(reference, candidate):
        # Same tolerance for every compared pair.
        return paddle.allclose(
            reference.grad, candidate.grad, rtol=1e-2, atol=1e-3).item()

    reference_inputs = get_test_tensors(channels)
    for tensor in reference_inputs:
        tensor.stop_gradient = False

    # Independent copies so each implementation accumulates its own grads.
    custom_inputs = [tensor.detach().clone() for tensor in reference_inputs]
    for tensor in custom_inputs:
        tensor.stop_gradient = False

    value_paddle, sampling_locations_paddle, attention_weights_paddle = \
        reference_inputs
    value_cuda, sampling_locations_cuda, attention_weights_cuda = \
        custom_inputs

    output_paddle = ms_deform_attn_core_paddle(
        value_paddle, spatial_shapes, level_start_index,
        sampling_locations_paddle, attention_weights_paddle)
    output_paddle.sum().backward()

    output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,
                                     level_start_index,
                                     sampling_locations_cuda,
                                     attention_weights_cuda)
    output_cuda.sum().backward()

    res = grads_close(value_paddle, value_cuda)
    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')

    res = grads_close(sampling_locations_paddle, sampling_locations_cuda)
    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')

    res = grads_close(attention_weights_paddle, attention_weights_cuda)
    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
    # Forward check first, then the gradient check for each channel size.
    check_forward_equal_with_paddle_float()

    channel_sizes = [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]
    for channels in channel_sizes:
        check_gradient_numerical(channels)

View File

@@ -0,0 +1,287 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register, serializable
from ppdet.modeling.ops import get_act_fn
from ..shape_spec import ShapeSpec
from ..backbones.csp_darknet import BaseConv
from ..backbones.cspresnet import RepVggBlock
from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
from ..initializer import xavier_uniform_, linear_init_
from ..layers import MultiHeadAttention
from paddle import ParamAttr
from paddle.regularizer import L2Decay
__all__ = ['HybridEncoder']
class CSPRepLayer(nn.Layer):
    """Two-branch block: a RepVgg stack summed with a 1x1 shortcut branch.

    Args:
        in_channels (int): Input channels of both 1x1 branch convolutions.
        out_channels (int): Output channels of the block.
        num_blocks (int): Number of ``RepVggBlock`` in the main branch.
        expansion (float): ``int(out_channels * expansion)`` is the width of
            both branches.
        bias (bool): ``bias`` passed to every ``BaseConv``.
        act: Activation passed to every ``BaseConv`` and ``RepVggBlock``.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_blocks=3,
                 expansion=1.0,
                 bias=False,
                 act="silu"):
        super(CSPRepLayer, self).__init__()
        width = int(out_channels * expansion)

        # Creation order (conv1, conv2, bottlenecks, conv3) is kept so that
        # parameter registration order is unchanged.
        self.conv1 = BaseConv(
            in_channels, width, ksize=1, stride=1, bias=bias, act=act)
        self.conv2 = BaseConv(
            in_channels, width, ksize=1, stride=1, bias=bias, act=act)

        rep_blocks = [
            RepVggBlock(
                width, width, act=act) for _ in range(num_blocks)
        ]
        self.bottlenecks = nn.Sequential(*rep_blocks)

        # A final 1x1 conv is only needed when the branch width differs from
        # the requested output width.
        if width == out_channels:
            self.conv3 = nn.Identity()
        else:
            self.conv3 = BaseConv(
                width, out_channels, ksize=1, stride=1, bias=bias, act=act)

    def forward(self, x):
        """Return ``conv3(bottlenecks(conv1(x)) + conv2(x))``."""
        main_branch = self.bottlenecks(self.conv1(x))
        shortcut_branch = self.conv2(x)
        return self.conv3(main_branch + shortcut_branch)
@register
class TransformerLayer(nn.Layer):
    """Self-attention plus feed-forward layer with pre- or post-norm.

    Args:
        d_model (int): Feature width.
        nhead (int): Number of attention heads.
        dim_feedforward (int): Hidden width of the feed-forward block.
        dropout (float): Dropout rate of the two residual branches; also the
            fallback for ``attn_dropout`` and ``act_dropout``.
        activation (str): Name of a function in ``paddle.nn.functional``.
        attn_dropout (float|None): Dropout passed to the attention module.
        act_dropout (float|None): Dropout applied after the activation.
        normalize_before (bool): If True, LayerNorm is applied before each
            sub-block, otherwise after the residual addition.
    """

    def __init__(self,
                 d_model,
                 nhead,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 attn_dropout=None,
                 act_dropout=None,
                 normalize_before=False):
        super(TransformerLayer, self).__init__()
        if attn_dropout is None:
            attn_dropout = dropout
        if act_dropout is None:
            act_dropout = dropout
        self.normalize_before = normalize_before

        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
        # Implementation of Feedforward model
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
        self.linear2 = nn.Linear(dim_feedforward, d_model)

        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
        self.activation = getattr(F, activation)
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize the two feed-forward linear layers."""
        for linear in (self.linear1, self.linear2):
            linear_init_(linear)

    @staticmethod
    def with_pos_embed(tensor, pos_embed):
        """Add ``pos_embed`` to ``tensor`` unless ``pos_embed`` is None."""
        if pos_embed is None:
            return tensor
        return tensor + pos_embed

    def forward(self, src, src_mask=None, pos_embed=None):
        """Apply self-attention and the feed-forward block to ``src``.

        The position embedding is added to the queries and keys only; the
        values are the (optionally pre-normalized) input.
        """
        pre_norm = self.normalize_before

        # self-attention sub-block
        shortcut = src
        attn_in = self.norm1(src) if pre_norm else src
        query = key = self.with_pos_embed(attn_in, pos_embed)
        attn_out = self.self_attn(
            query, key, value=attn_in, attn_mask=src_mask)
        src = shortcut + self.dropout1(attn_out)
        if not pre_norm:
            src = self.norm1(src)

        # feed-forward sub-block
        shortcut = src
        ffn_in = self.norm2(src) if pre_norm else src
        ffn_hidden = self.dropout(self.activation(self.linear1(ffn_in)))
        ffn_out = self.linear2(ffn_hidden)
        src = shortcut + self.dropout2(ffn_out)
        if not pre_norm:
            src = self.norm2(src)
        return src
@register
@serializable
class HybridEncoder(nn.Layer):
    """Projection + transformer encoder + top-down/bottom-up feature fusion.

    Each input level is projected to ``hidden_dim`` channels, the levels
    listed in ``use_encoder_idx`` are passed through a transformer encoder,
    and the result is fused by a top-down (FPN) pass followed by a bottom-up
    (PAN) pass. One output feature map is produced per input level.
    """
    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
    __inject__ = ['encoder_layer']

    def __init__(self,
                 in_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 hidden_dim=256,
                 use_encoder_idx=[2],
                 num_encoder_layers=1,
                 encoder_layer='TransformerLayer',
                 pe_temperature=10000,
                 expansion=1.0,
                 depth_mult=1.0,
                 act='silu',
                 trt=False,
                 eval_size=None):
        """Create all sub-layers.

        ``eval_size`` (indexed as ``[0]`` for height and ``[1]`` for width in
        ``_reset_parameters``) enables pre-computed position embeddings for
        evaluation. ``round(3 * depth_mult)`` is the number of blocks in each
        ``CSPRepLayer``.
        """
        super(HybridEncoder, self).__init__()
        self.in_channels = in_channels
        self.feat_strides = feat_strides
        self.hidden_dim = hidden_dim
        self.use_encoder_idx = use_encoder_idx
        self.num_encoder_layers = num_encoder_layers
        self.pe_temperature = pe_temperature
        self.eval_size = eval_size

        # NOTE: sub-layers are created in a fixed order; do not reorder them.

        # channel projection
        self.input_proj = nn.LayerList()
        for num_in in in_channels:
            proj_conv = nn.Conv2D(
                num_in, hidden_dim, kernel_size=1, bias_attr=False)
            proj_norm = nn.BatchNorm2D(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
            self.input_proj.append(nn.Sequential(proj_conv, proj_norm))

        # encoder transformer: one encoder per selected level
        encoders = [
            TransformerEncoder(encoder_layer, num_encoder_layers)
            for _ in use_encoder_idx
        ]
        self.encoder = nn.LayerList(encoders)

        # Resolve activation specs (None / str / dict) into a callable.
        if act is None or isinstance(act, (str, dict)):
            act = get_act_fn(act, trt=trt)

        num_fusions = len(in_channels) - 1
        num_blocks = round(3 * depth_mult)

        # top-down fpn
        self.lateral_convs = nn.LayerList()
        self.fpn_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.lateral_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 1, 1, act=act))
            self.fpn_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    num_blocks,
                    act=act,
                    expansion=expansion))

        # bottom-up pan
        self.downsample_convs = nn.LayerList()
        self.pan_blocks = nn.LayerList()
        for _ in range(num_fusions):
            self.downsample_convs.append(
                BaseConv(
                    hidden_dim, hidden_dim, 3, stride=2, act=act))
            self.pan_blocks.append(
                CSPRepLayer(
                    hidden_dim * 2,
                    hidden_dim,
                    num_blocks,
                    act=act,
                    expansion=expansion))

        self._reset_parameters()

    def _reset_parameters(self):
        """Pre-compute position embeddings when ``eval_size`` is set.

        One embedding per encoded level is stored as attribute
        ``pos_embed{idx}``.
        """
        if not self.eval_size:
            return
        eval_h, eval_w = self.eval_size[0], self.eval_size[1]
        for idx in self.use_encoder_idx:
            stride = self.feat_strides[idx]
            pos_embed = self.build_2d_sincos_position_embedding(
                eval_w // stride, eval_h // stride, self.hidden_dim,
                self.pe_temperature)
            setattr(self, f'pos_embed{idx}', pos_embed)

    @staticmethod
    def build_2d_sincos_position_embedding(w,
                                           h,
                                           embed_dim=256,
                                           temperature=10000.):
        """Build a fixed 2D sin-cos position embedding.

        Args:
            w: Grid width (converted with ``int``).
            h: Grid height (converted with ``int``).
            embed_dim (int): Embedding size; must be divisible by 4.
            temperature (float): Base of the frequency scaling.

        Returns:
            Tensor with a leading axis of size 1 and ``embed_dim`` channels
            in the last axis.
        """
        axis_w = paddle.arange(int(w), dtype=paddle.float32)
        axis_h = paddle.arange(int(h), dtype=paddle.float32)
        grid_w, grid_h = paddle.meshgrid(axis_w, axis_h)
        assert embed_dim % 4 == 0, \
            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
        # A quarter of the channels each for sin(w), cos(w), sin(h), cos(h).
        pos_dim = embed_dim // 4
        exponents = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
        omega = 1. / (temperature**exponents)

        freq_row = omega.unsqueeze(0)
        out_w = grid_w.flatten().unsqueeze(-1) @ freq_row
        out_h = grid_h.flatten().unsqueeze(-1) @ freq_row

        components = [
            paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
            paddle.cos(out_h)
        ]
        return paddle.concat(components, axis=1).unsqueeze(0)

    def forward(self, feats, for_mot=False):
        """Fuse the multi-level ``feats`` and return one map per level.

        ``for_mot`` is accepted but not used by this method.
        """
        num_levels = len(self.in_channels)
        assert len(feats) == num_levels
        # get projection features
        proj_feats = [
            self.input_proj[i](feat) for i, feat in enumerate(feats)
        ]

        # encoder
        if self.num_encoder_layers > 0:
            for i, enc_ind in enumerate(self.use_encoder_idx):
                level_feat = proj_feats[enc_ind]
                h, w = level_feat.shape[2:]
                # flatten [B, C, H, W] to [B, HxW, C]
                src_flatten = level_feat.flatten(2).transpose([0, 2, 1])
                # Cached embeddings are only used in eval mode with a fixed
                # eval_size; otherwise build one for the current size.
                if self.training or self.eval_size is None:
                    pos_embed = self.build_2d_sincos_position_embedding(
                        w, h, self.hidden_dim, self.pe_temperature)
                else:
                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
                # back to [B, C, H, W]
                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
                    [-1, self.hidden_dim, h, w])

        # top-down fpn: start at the last level and walk towards level 0
        inner_outs = [proj_feats[-1]]
        for step, idx in enumerate(range(num_levels - 1, 0, -1)):
            feat_high = self.lateral_convs[step](inner_outs[0])
            feat_low = proj_feats[idx - 1]
            inner_outs[0] = feat_high
            upsample_feat = F.interpolate(
                feat_high, scale_factor=2., mode="nearest")
            fused = paddle.concat([upsample_feat, feat_low], axis=1)
            inner_outs.insert(0, self.fpn_blocks[step](fused))

        # bottom-up pan: start at level 0 and walk towards the last level
        outs = [inner_outs[0]]
        for idx in range(num_levels - 1):
            downsample_feat = self.downsample_convs[idx](outs[-1])
            feat_high = inner_outs[idx + 1]
            fused = paddle.concat([downsample_feat, feat_high], axis=1)
            outs.append(self.pan_blocks[idx](fused))

        return outs

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``in_channels`` and ``feat_strides`` from ``input_shape``."""
        channels = [spec.channels for spec in input_shape]
        strides = [spec.stride for spec in input_shape]
        return {'in_channels': channels, 'feat_strides': strides}

    @property
    def out_shape(self):
        """One ``ShapeSpec`` per level: ``hidden_dim`` channels, input stride."""
        specs = []
        for idx in range(len(self.in_channels)):
            specs.append(
                ShapeSpec(
                    channels=self.hidden_dim, stride=self.feat_strides[idx]))
        return specs

View File

@@ -0,0 +1,184 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from scipy.optimize import linear_sum_assignment
from ppdet.core.workspace import register, serializable
from ..losses.iou_loss import GIoULoss
from .utils import bbox_cxcywh_to_xyxy
__all__ = ['HungarianMatcher']
@register
@serializable
class HungarianMatcher(nn.Layer):
__shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']
def __init__(self,
             matcher_coeff={
                 'class': 1,
                 'bbox': 5,
                 'giou': 2,
                 'mask': 1,
                 'dice': 1
             },
             use_focal_loss=False,
             with_mask=False,
             num_sample_points=12544,
             alpha=0.25,
             gamma=2.0):
    r"""
    Args:
        matcher_coeff (dict): The coefficient of hungarian matcher cost.
            The mapping is copied, so the matcher neither shares the
            default dict between instances nor aliases the caller's dict.
        use_focal_loss (bool): If True, ``forward`` uses sigmoid
            probabilities and a focal-style classification cost;
            otherwise softmax probabilities are used.
        with_mask (bool): Stored on the instance.
        num_sample_points (int): Stored on the instance.
        alpha (float): Weighting factor of the focal classification cost.
        gamma (float): Exponent of the focal classification cost.
    """
    super(HungarianMatcher, self).__init__()
    # The default above is a single dict object created once at definition
    # time; storing it directly would let a mutation through one matcher
    # leak into every other matcher (and into later default calls).
    self.matcher_coeff = dict(matcher_coeff)
    self.use_focal_loss = use_focal_loss
    self.with_mask = with_mask
    self.num_sample_points = num_sample_points
    self.alpha = alpha
    self.gamma = gamma

    self.giou_loss = GIoULoss()
def forward(self,
boxes,
logits,
gt_bbox,
gt_class,
masks=None,
gt_mask=None):
r"""
Args:
boxes (Tensor): [b, query, 4]
logits (Tensor): [b, query, num_classes]
gt_bbox (List(Tensor)): list[[n, 4]]
gt_class (List(Tensor)): list[[n, 1]]
masks (Tensor|None): [b, query, h, w]
gt_mask (List(Tensor)): list[[n, H, W]]
Returns:
A list of size batch_size, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds:
len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
"""
bs, num_queries = boxes.shape[:2]
num_gts = [len(a) for a in gt_class]
if sum(num_gts) == 0:
return [(paddle.to_tensor(
[], dtype=paddle.int64), paddle.to_tensor(
[], dtype=paddle.int64)) for _ in range(bs)]
# We flatten to compute the cost matrices in a batch
# [batch_size * num_queries, num_classes]
logits = logits.detach()
out_prob = F.sigmoid(logits.flatten(
0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
# [batch_size * num_queries, 4]
out_bbox = boxes.detach().flatten(0, 1)
# Also concat the target labels and boxes
tgt_ids = paddle.concat(gt_class).flatten()
tgt_bbox = paddle.concat(gt_bbox)
# Compute the classification cost
out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
if self.use_focal_loss:
neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
1 - out_prob + 1e-8).log())
pos_cost_class = self.alpha * (
(1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
cost_class = pos_cost_class - neg_cost_class
else:
cost_class = -out_prob
# Compute the L1 cost between boxes
cost_bbox = (
out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)
# Compute the giou cost betwen boxes
cost_giou = self.giou_loss(
bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
# Final cost matrix
C = self.matcher_coeff['class'] * cost_class + \
self.matcher_coeff['bbox'] * cost_bbox + \
self.matcher_coeff['giou'] * cost_giou
# Compute the mask cost and dice cost
if self.with_mask:
assert (masks is not None and gt_mask is not None,
'Make sure the input has `mask` and `gt_mask`')
# all masks share the same set of points for efficient matching
sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
sample_points = 2.0 * sample_points - 1.0
out_mask = F.grid_sample(
masks.detach(), sample_points, align_corners=False).squeeze(-2)
out_mask = out_mask.flatten(0, 1)
tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
sample_points = paddle.concat([
a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
if b > 0
])
tgt_mask = F.grid_sample(
tgt_mask, sample_points, align_corners=False).squeeze([1, 2])
with paddle.amp.auto_cast(enable=False):
# binary cross entropy cost
pos_cost_mask = F.binary_cross_entropy_with_logits(
out_mask, paddle.ones_like(out_mask), reduction='none')
neg_cost_mask = F.binary_cross_entropy_with_logits(
out_mask, paddle.zeros_like(out_mask), reduction='none')
cost_mask = paddle.matmul(
pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
neg_cost_mask, 1 - tgt_mask, transpose_y=True)
cost_mask /= self.num_sample_points
# dice cost
out_mask = F.sigmoid(out_mask)
numerator = 2 * paddle.matmul(
out_mask, tgt_mask, transpose_y=True)
denominator = out_mask.sum(
-1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
cost_dice = 1 - (numerator + 1) / (denominator + 1)
C = C + self.matcher_coeff['mask'] * cost_mask + \
self.matcher_coeff['dice'] * cost_dice
C = C.reshape([bs, num_queries, -1])
C = [a.squeeze(0) for a in C.chunk(bs)]
sizes = [a.shape[0] for a in gt_bbox]
indices = [
linear_sum_assignment(c.split(sizes, -1)[i].numpy())
for i, c in enumerate(C)
]
return [(paddle.to_tensor(
i, dtype=paddle.int64), paddle.to_tensor(
j, dtype=paddle.int64)) for i, j in indices]

View File

@@ -0,0 +1,100 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.nn as nn
from ppdet.core.workspace import register, serializable
@register
@serializable
class PositionEmbedding(nn.Layer):
    """2-D position embedding, either sinusoidal ('sine') or learned."""

    def __init__(self,
                 num_pos_feats=128,
                 temperature=10000,
                 normalize=True,
                 scale=2 * math.pi,
                 embed_type='sine',
                 num_embeddings=50,
                 offset=0.,
                 eps=1e-6):
        """
        Args:
            num_pos_feats (int): Feature count per axis; the 'sine' output
                concatenates the y and x parts along the last axis.
            temperature (float): Base of the frequency schedule ('sine').
            normalize (bool): Rescale cumulative positions by their last
                value and multiply by ``scale`` ('sine').
            scale (float): Multiplier applied after normalization ('sine').
            embed_type (str): 'sine' or 'learned'.
            num_embeddings (int): Table size of the row/column embeddings
                ('learned').
            offset (float): Added to positions before normalization.
            eps (float): Added to the normalization denominator.
        """
        super(PositionEmbedding, self).__init__()
        assert embed_type in ['sine', 'learned']

        self.embed_type = embed_type
        self.offset = offset
        self.eps = eps
        if embed_type == 'sine':
            self.num_pos_feats = num_pos_feats
            self.temperature = temperature
            self.normalize = normalize
            self.scale = scale
        elif embed_type == 'learned':
            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
        else:
            raise ValueError(f"{self.embed_type} is not supported.")

    @staticmethod
    def _interleave_sin_cos(angles):
        # sin on the even channels, cos on the odd ones, interleaved again
        # along the last axis.
        sin_part = angles[:, :, :, 0::2].sin()
        cos_part = angles[:, :, :, 1::2].cos()
        return paddle.stack((sin_part, cos_part), axis=4).flatten(3)

    def _sine_embedding(self, mask):
        # Cumulative sums along H and W act as the y / x coordinates.
        y_embed = mask.cumsum(1)
        x_embed = mask.cumsum(2)
        if self.normalize:
            y_total = y_embed[:, -1:, :] + self.eps
            x_total = x_embed[:, :, -1:] + self.eps
            y_embed = (y_embed + self.offset) / y_total * self.scale
            x_embed = (x_embed + self.offset) / x_total * self.scale

        pair_index = paddle.arange(self.num_pos_feats) // 2
        dim_t = 2 * pair_index.astype('float32')
        dim_t = self.temperature**(dim_t / self.num_pos_feats)

        pos_x = self._interleave_sin_cos(x_embed.unsqueeze(-1) / dim_t)
        pos_y = self._interleave_sin_cos(y_embed.unsqueeze(-1) / dim_t)
        return paddle.concat((pos_y, pos_x), axis=3)

    def _learned_embedding(self, mask):
        h, w = mask.shape[-2:]
        x_emb = self.col_embed(paddle.arange(w))
        y_emb = self.row_embed(paddle.arange(h))
        x_grid = x_emb.unsqueeze(0).tile([h, 1, 1])
        y_grid = y_emb.unsqueeze(1).tile([1, w, 1])
        # The learned variant returns a single leading batch entry.
        return paddle.concat([x_grid, y_grid], axis=-1).unsqueeze(0)

    def forward(self, mask):
        """
        Args:
            mask (Tensor): [B, H, W]
        Returns:
            pos (Tensor): [B, H, W, C]
        """
        if self.embed_type == 'sine':
            return self._sine_embedding(mask)
        if self.embed_type == 'learned':
            return self._learned_embedding(mask)
        raise ValueError(f"not supported {self.embed_type}")

View File

@@ -0,0 +1,523 @@
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from ppdet.core.workspace import register
from ..layers import MultiHeadAttention
from .deformable_transformer import MSDeformableAttention
from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
bias_init_with_prob)
from .utils import (_get_clones, get_sine_pos_embed,
get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
__all__ = ['RTDETRTransformer']
class PPMSDeformableAttention(MSDeformableAttention):
    """Multi-scale deformable attention with a pure-Python fallback path.

    The forward pass dispatches to ``self.ms_deformable_attn_core`` when
    ``query`` is a ``paddle.Tensor`` and to ``deformable_attention_core_func``
    otherwise.
    """

    def forward(self,
                query,
                reference_points,
                value,
                value_spatial_shapes,
                value_level_start_index,
                value_mask=None):
        """
        Args:
            query (Tensor): [bs, query_length, C]
            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
                bottom-right (1, 1), including padding area
            value (Tensor): [bs, value_length, C]
            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements

        Returns:
            output (Tensor): [bs, Length_{query}, C]

        Raises:
            ValueError: If the last dim of ``reference_points`` is not 2 or 4.
        """
        batch, num_query = query.shape[:2]
        num_value = value.shape[1]

        value = self.value_proj(value)
        if value_mask is not None:
            # Zero out the projected values at padded positions.
            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
            value *= value_mask
        value = value.reshape(
            [batch, num_value, self.num_heads, self.head_dim])

        offsets = self.sampling_offsets(query).reshape([
            batch, num_query, self.num_heads, self.num_levels,
            self.num_points, 2
        ])
        weights = self.attention_weights(query).reshape(
            [batch, num_query, self.num_heads,
             self.num_levels * self.num_points])
        # Softmax runs jointly over all levels and points of a head.
        weights = F.softmax(weights).reshape([
            batch, num_query, self.num_heads, self.num_levels,
            self.num_points
        ])

        ref_dim = reference_points.shape[-1]
        if ref_dim == 2:
            # (H, W) -> (W, H) so the offsets are scaled per x / y axis.
            normalizer = paddle.to_tensor(value_spatial_shapes)
            normalizer = normalizer.flip([1]).reshape(
                [1, 1, 1, self.num_levels, 1, 2])
            anchors = reference_points.reshape(
                [batch, num_query, 1, self.num_levels, 1, 2])
            sampling_locations = anchors + offsets / normalizer
        elif ref_dim == 4:
            centers = reference_points[:, :, None, :, None, :2]
            sizes = reference_points[:, :, None, :, None, 2:]
            sampling_locations = (
                centers + offsets / self.num_points * sizes * 0.5)
        else:
            raise ValueError(
                "Last dim of reference_points must be 2 or 4, but get {} instead.".
                format(ref_dim))

        if isinstance(query, paddle.Tensor):
            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
            value_level_start_index = paddle.to_tensor(value_level_start_index)
            attended = self.ms_deformable_attn_core(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, weights)
        else:
            from ppdet.modeling.transformers.utils import deformable_attention_core_func
            attended = deformable_attention_core_func(
                value, value_spatial_shapes, value_level_start_index,
                sampling_locations, weights)

        return self.output_proj(attended)
class TransformerDecoderLayer(nn.Layer):
    """Decoder layer: self-attention, deformable cross-attention, then FFN.

    Each of the three sub-blocks is followed by dropout, a residual add and
    a LayerNorm whose parameters are created with ``L2Decay(0.0)``.
    """

    def __init__(self,
                 d_model=256,
                 n_head=8,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 n_levels=4,
                 n_points=4,
                 weight_attr=None,
                 bias_attr=None):
        """
        Args:
            d_model (int): Feature width of the layer.
            n_head (int): Number of attention heads.
            dim_feedforward (int): Hidden width of the FFN.
            dropout (float): Dropout rate used by every dropout in the layer.
            activation (str): Name of the function looked up on
                ``paddle.nn.functional`` for the FFN.
            n_levels (int): Number of feature levels for cross attention.
            n_points (int): Sampling points per level for cross attention.
            weight_attr: Passed to both FFN ``nn.Linear`` layers.
            bias_attr: Passed to both FFN ``nn.Linear`` layers.
        """
        super(TransformerDecoderLayer, self).__init__()

        def make_norm():
            # Fresh ParamAttr objects for every LayerNorm instance.
            return nn.LayerNorm(
                d_model,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0)))

        # self attention
        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = make_norm()

        # cross attention
        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
                                                  n_points, 1.0)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = make_norm()

        # ffn
        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
                                 bias_attr)
        self.activation = getattr(F, activation)
        self.dropout3 = nn.Dropout(dropout)
        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
                                 bias_attr)
        self.dropout4 = nn.Dropout(dropout)
        self.norm3 = make_norm()
        self._reset_parameters()

    def _reset_parameters(self):
        """Re-initialize both FFN linears, then xavier-init their weights."""
        ffn_layers = (self.linear1, self.linear2)
        for layer in ffn_layers:
            linear_init_(layer)
        for layer in ffn_layers:
            xavier_uniform_(layer.weight)

    def with_pos_embed(self, tensor, pos):
        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
        if pos is None:
            return tensor
        return tensor + pos

    def forward_ffn(self, tgt):
        """Apply linear1 -> activation -> dropout3 -> linear2 to ``tgt``."""
        hidden = self.linear1(tgt)
        hidden = self.activation(hidden)
        hidden = self.dropout3(hidden)
        return self.linear2(hidden)

    def forward(self,
                tgt,
                reference_points,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                attn_mask=None,
                memory_mask=None,
                query_pos_embed=None):
        """Run self-attention, cross-attention and FFN on ``tgt``.

        ``attn_mask`` (if given) is turned into an additive mask: entries
        that cast to True become 0, all other entries become ``-inf``.
        """
        # self attention
        queries = keys = self.with_pos_embed(tgt, query_pos_embed)
        if attn_mask is not None:
            keep = attn_mask.astype('bool')
            zeros = paddle.zeros(attn_mask.shape, tgt.dtype)
            neg_inf = paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)
            attn_mask = paddle.where(keep, zeros, neg_inf)
        self_out = self.self_attn(
            queries, keys, value=tgt, attn_mask=attn_mask)
        tgt = self.norm1(tgt + self.dropout1(self_out))

        # cross attention
        cross_query = self.with_pos_embed(tgt, query_pos_embed)
        cross_out = self.cross_attn(
            cross_query, reference_points, memory, memory_spatial_shapes,
            memory_level_start_index, memory_mask)
        tgt = self.norm2(tgt + self.dropout2(cross_out))

        # ffn
        ffn_out = self.forward_ffn(tgt)
        tgt = self.norm3(tgt + self.dropout4(ffn_out))

        return tgt
class TransformerDecoder(nn.Layer):
    """Stack of cloned decoder layers with iterative box refinement."""

    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
        """
        Args:
            hidden_dim (int): Stored on the instance as ``hidden_dim``.
            decoder_layer (nn.Layer): Layer that is cloned ``num_layers`` times.
            num_layers (int): Number of decoder layers.
            eval_idx (int): Index of the layer whose prediction is returned
                outside training; negative values count from the end.
        """
        super(TransformerDecoder, self).__init__()
        self.layers = _get_clones(decoder_layer, num_layers)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        if eval_idx < 0:
            eval_idx = num_layers + eval_idx
        self.eval_idx = eval_idx

    def forward(self,
                tgt,
                ref_points_unact,
                memory,
                memory_spatial_shapes,
                memory_level_start_index,
                bbox_head,
                score_head,
                query_pos_head,
                attn_mask=None,
                memory_mask=None):
        """Decode ``tgt`` and return stacked (boxes, logits).

        In training every layer contributes one entry; otherwise only the
        layer at ``self.eval_idx`` does and the loop stops right after it.
        """
        hidden = tgt
        bbox_outputs = []
        logit_outputs = []
        refs_detached = F.sigmoid(ref_points_unact)
        for i, layer in enumerate(self.layers):
            layer_refs = refs_detached.unsqueeze(2)
            pos_embed = query_pos_head(refs_detached)

            hidden = layer(hidden, layer_refs, memory, memory_spatial_shapes,
                           memory_level_start_index, attn_mask, memory_mask,
                           pos_embed)

            refined = F.sigmoid(bbox_head[i](hidden) + inverse_sigmoid(
                refs_detached))

            if self.training:
                logit_outputs.append(score_head[i](hidden))
                if i == 0:
                    bbox_outputs.append(refined)
                else:
                    # Later layers predict relative to the previous layer's
                    # refined boxes that were not detached.
                    bbox_outputs.append(
                        F.sigmoid(bbox_head[i](hidden) + inverse_sigmoid(
                            prev_refs)))
            elif i == self.eval_idx:
                logit_outputs.append(score_head[i](hidden))
                bbox_outputs.append(refined)
                break

            prev_refs = refined
            if self.training:
                refs_detached = refined.detach()
            else:
                refs_detached = refined

        return paddle.stack(bbox_outputs), paddle.stack(logit_outputs)
@register
class RTDETRTransformer(nn.Layer):
    """RT-DETR decoder head: input projection, query selection and decoding.

    Backbone/encoder feature maps are projected to ``hidden_dim``, flattened
    into one memory sequence, the top ``num_queries`` encoder proposals are
    picked as initial reference boxes and a stack of deformable decoder
    layers refines them. During training, contrastive denoising queries are
    prepended to the decoder input.
    """
    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']

    def __init__(self,
                 num_classes=80,
                 hidden_dim=256,
                 num_queries=300,
                 position_embed_type='sine',
                 backbone_feat_channels=[512, 1024, 2048],
                 feat_strides=[8, 16, 32],
                 num_levels=3,
                 num_decoder_points=4,
                 nhead=8,
                 num_decoder_layers=6,
                 dim_feedforward=1024,
                 dropout=0.,
                 activation="relu",
                 num_denoising=100,
                 label_noise_ratio=0.5,
                 box_noise_scale=1.0,
                 learnt_init_query=True,
                 eval_size=None,
                 eval_idx=-1,
                 eps=1e-2):
        """
        Args:
            num_classes (int): Output width of every score head and number
                of rows of the denoising class embedding.
            hidden_dim (int): Feature width used throughout the module.
            num_queries (int): Number of encoder proposals selected as
                decoder queries.
            position_embed_type (str): Must be 'sine' or 'learned'; it is
                only validated in this constructor.
            backbone_feat_channels (list[int]): Channels of each input
                feature map; at most ``num_levels`` entries.
            feat_strides (list[int]): Stride of each input feature map. It
                is copied, and the copy is extended by doubling the last
                stride until it has ``num_levels`` entries.
            num_levels (int): Number of feature levels fed to the decoder.
            num_decoder_points (int): Sampling points per level in the
                decoder cross attention.
            nhead (int): Number of attention heads.
            num_decoder_layers (int): Number of decoder layers and of
                per-layer score/box heads.
            dim_feedforward (int): FFN hidden width of a decoder layer.
            dropout (float): Dropout rate of a decoder layer.
            activation (str): FFN activation name of a decoder layer.
            num_denoising (int): Forwarded to
                ``get_contrastive_denoising_training_group``.
            label_noise_ratio (float): Forwarded to the same helper.
            box_noise_scale (float): Forwarded to the same helper.
            learnt_init_query (bool): Use a learned embedding as the initial
                decoder target instead of gathered encoder features.
            eval_size: When truthy, anchors are pre-computed once for this
                size (indexed as ``eval_size[0]``, ``eval_size[1]``) and
                reused outside training.
            eval_idx (int): Decoder layer whose output is returned outside
                training; negative values count from the end.
            eps (float): Margin used to build the anchor validity mask.
        """
        super(RTDETRTransformer, self).__init__()
        assert position_embed_type in ['sine', 'learned'], \
            f'ValueError: position_embed_type not supported {position_embed_type}!'
        assert len(backbone_feat_channels) <= num_levels
        assert len(feat_strides) == len(backbone_feat_channels)
        # Work on a private copy: appending to the argument in place would
        # modify the caller's list, or the shared default list, and make a
        # later default-constructed instance fail the length assertion above.
        feat_strides = list(feat_strides)
        for _ in range(num_levels - len(feat_strides)):
            feat_strides.append(feat_strides[-1] * 2)

        self.hidden_dim = hidden_dim
        self.nhead = nhead
        self.feat_strides = feat_strides
        self.num_levels = num_levels
        self.num_classes = num_classes
        self.num_queries = num_queries
        self.eps = eps
        self.num_decoder_layers = num_decoder_layers
        self.eval_size = eval_size

        # backbone feature projection
        self._build_input_proj_layer(backbone_feat_channels)

        # Transformer module
        decoder_layer = TransformerDecoderLayer(
            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
            num_decoder_points)
        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
                                          num_decoder_layers, eval_idx)

        # denoising part
        self.denoising_class_embed = nn.Embedding(
            num_classes,
            hidden_dim,
            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
        self.num_denoising = num_denoising
        self.label_noise_ratio = label_noise_ratio
        self.box_noise_scale = box_noise_scale

        # decoder embedding
        self.learnt_init_query = learnt_init_query
        if learnt_init_query:
            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)

        # encoder head
        self.enc_output = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(
                hidden_dim,
                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)

        # decoder head
        self.dec_score_head = nn.LayerList([
            nn.Linear(hidden_dim, num_classes)
            for _ in range(num_decoder_layers)
        ])
        self.dec_bbox_head = nn.LayerList([
            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
            for _ in range(num_decoder_layers)
        ])

        self._reset_parameters()

    def _reset_parameters(self):
        """Initialize heads and projections; pre-compute eval anchors if set."""
        # class and bbox head init
        bias_cls = bias_init_with_prob(0.01)
        linear_init_(self.enc_score_head)
        constant_(self.enc_score_head.bias, bias_cls)
        constant_(self.enc_bbox_head.layers[-1].weight)
        constant_(self.enc_bbox_head.layers[-1].bias)
        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
            linear_init_(cls_)
            constant_(cls_.bias, bias_cls)
            constant_(reg_.layers[-1].weight)
            constant_(reg_.layers[-1].bias)

        linear_init_(self.enc_output[0])
        xavier_uniform_(self.enc_output[0].weight)
        if self.learnt_init_query:
            xavier_uniform_(self.tgt_embed.weight)
        xavier_uniform_(self.query_pos_head.layers[0].weight)
        xavier_uniform_(self.query_pos_head.layers[1].weight)
        for l in self.input_proj:
            # l[0] is the conv of each (conv, norm) projection pair.
            xavier_uniform_(l[0].weight)

        # init encoder output anchors and valid_mask
        if self.eval_size:
            self.anchors, self.valid_mask = self._generate_anchors()

    @classmethod
    def from_config(cls, cfg, input_shape):
        """Derive ``backbone_feat_channels`` from the input shape specs."""
        return {'backbone_feat_channels': [i.channels for i in input_shape]}

    def _build_input_proj_layer(self, backbone_feat_channels):
        """Create one conv+BN projection per level in ``self.input_proj``.

        Existing inputs get a 1x1 conv; every extra level up to
        ``self.num_levels`` gets a 3x3 stride-2 conv.
        """
        self.input_proj = nn.LayerList()
        for in_channels in backbone_feat_channels:
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
        in_channels = backbone_feat_channels[-1]
        for _ in range(self.num_levels - len(backbone_feat_channels)):
            self.input_proj.append(
                nn.Sequential(
                    ('conv', nn.Conv2D(
                        in_channels,
                        self.hidden_dim,
                        kernel_size=3,
                        stride=2,
                        padding=1,
                        bias_attr=False)), ('norm', nn.BatchNorm2D(
                            self.hidden_dim,
                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
            # Levels after the first extra one consume projected features.
            in_channels = self.hidden_dim

    def _get_encoder_input(self, feats):
        """Project and flatten ``feats`` into one memory sequence.

        Returns:
            tuple: (feat_flatten [b, l, c], spatial_shapes as a list of
            [h, w], level_start_index as a list of start offsets).
        """
        # get projection features
        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
        if self.num_levels > len(proj_feats):
            len_srcs = len(proj_feats)
            for i in range(len_srcs, self.num_levels):
                if i == len_srcs:
                    # First extra level is built from the last raw input.
                    proj_feats.append(self.input_proj[i](feats[-1]))
                else:
                    proj_feats.append(self.input_proj[i](proj_feats[-1]))

        # get encoder inputs
        feat_flatten = []
        spatial_shapes = []
        level_start_index = [0, ]
        for i, feat in enumerate(proj_feats):
            _, _, h, w = feat.shape
            # [b, c, h, w] -> [b, h*w, c]
            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
            # [num_levels, 2]
            spatial_shapes.append([h, w])
            # [l], start index of each level
            level_start_index.append(h * w + level_start_index[-1])

        # [b, l, c]
        feat_flatten = paddle.concat(feat_flatten, 1)
        # Drop the trailing total length; only start offsets are kept.
        level_start_index.pop()
        return (feat_flatten, spatial_shapes, level_start_index)

    def forward(self, feats, pad_mask=None, gt_meta=None):
        """Run query selection and decoding on ``feats``.

        Args:
            feats (list[Tensor]): Feature maps, one per input level.
            pad_mask: Accepted but not used by this implementation.
            gt_meta: Ground-truth metadata handed to the denoising helper;
                only read in training mode.

        Returns:
            tuple: (out_bboxes, out_logits, enc_topk_bboxes,
            enc_topk_logits, dn_meta); ``dn_meta`` is None outside training.
        """
        # input projection and embedding
        (memory, spatial_shapes,
         level_start_index) = self._get_encoder_input(feats)

        # prepare denoising training
        if self.training:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
                get_contrastive_denoising_training_group(gt_meta,
                                            self.num_classes,
                                            self.num_queries,
                                            self.denoising_class_embed.weight,
                                            self.num_denoising,
                                            self.label_noise_ratio,
                                            self.box_noise_scale)
        else:
            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None

        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
            self._get_decoder_input(
            memory, spatial_shapes, denoising_class, denoising_bbox_unact)

        # decoder
        out_bboxes, out_logits = self.decoder(
            target,
            init_ref_points_unact,
            memory,
            spatial_shapes,
            level_start_index,
            self.dec_bbox_head,
            self.dec_score_head,
            self.query_pos_head,
            attn_mask=attn_mask)
        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
                dn_meta)

    def _generate_anchors(self,
                          spatial_shapes=None,
                          grid_size=0.05,
                          dtype="float32"):
        """Build logit-space anchors and their validity mask.

        When ``spatial_shapes`` is None the shapes are derived from
        ``self.eval_size`` and ``self.feat_strides``. Anchors with any
        coordinate outside ``(eps, 1 - eps)`` are marked invalid and set to
        ``inf``.
        """
        if spatial_shapes is None:
            spatial_shapes = [
                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
                for s in self.feat_strides
            ]
        anchors = []
        for lvl, (h, w) in enumerate(spatial_shapes):
            grid_y, grid_x = paddle.meshgrid(
                paddle.arange(
                    end=h, dtype=dtype),
                paddle.arange(
                    end=w, dtype=dtype))
            grid_xy = paddle.stack([grid_x, grid_y], -1)

            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
            # Cell centers normalized by the level's width / height.
            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
            # Anchor size doubles with every level.
            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
            anchors.append(
                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))

        anchors = paddle.concat(anchors, 1)
        valid_mask = ((anchors > self.eps) *
                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
        # Inverse sigmoid, so anchors live in the same space as box logits.
        anchors = paddle.log(anchors / (1 - anchors))
        anchors = paddle.where(valid_mask, anchors,
                               paddle.to_tensor(float("inf")))
        return anchors, valid_mask

    def _get_decoder_input(self,
                           memory,
                           spatial_shapes,
                           denoising_class=None,
                           denoising_bbox_unact=None):
        """Select the top-k encoder proposals and assemble the decoder input.

        Returns:
            tuple: (target, reference_points_unact, enc_topk_bboxes,
            enc_topk_logits). Denoising queries, when given, are placed in
            front of the selected queries in ``target`` and
            ``reference_points_unact``.
        """
        bs, _, _ = memory.shape
        # prepare input for decoder
        if self.training or self.eval_size is None:
            anchors, valid_mask = self._generate_anchors(spatial_shapes)
        else:
            anchors, valid_mask = self.anchors, self.valid_mask
        # Zero the memory at positions whose anchor is invalid.
        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
        output_memory = self.enc_output(memory)

        enc_outputs_class = self.enc_score_head(output_memory)
        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors

        # Rank positions by their best class score.
        _, topk_ind = paddle.topk(
            enc_outputs_class.max(-1), self.num_queries, axis=1)
        # extract region proposal boxes
        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
                                                  topk_ind)  # unsigmoided.
        enc_topk_bboxes = F.sigmoid(reference_points_unact)
        if denoising_bbox_unact is not None:
            reference_points_unact = paddle.concat(
                [denoising_bbox_unact, reference_points_unact], 1)
        if self.training:
            reference_points_unact = reference_points_unact.detach()
        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)

        # extract region features
        if self.learnt_init_query:
            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
        else:
            target = paddle.gather_nd(output_memory, topk_ind)
            if self.training:
                target = target.detach()
        if denoising_class is not None:
            target = paddle.concat([denoising_class, target], 1)

        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits

View File

@@ -0,0 +1,481 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Modified from DETR (https://github.com/facebookresearch/detr)
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Modified from detrex (https://github.com/IDEA-Research/detrex)
# Copyright 2022 The IDEA Authors. All rights reserved.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import copy
import math
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
__all__ = [
'_get_clones', 'bbox_cxcywh_to_xyxy',
'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',
'deformable_attention_core_func', 'varifocal_loss_with_logits'
]
def bbox_area(boxes):
    """Return the area of every box in ``boxes``.

    The area is ``(col2 - col0) * (col3 - col1)`` per row, i.e. the columns
    are presumably laid out as (x1, y1, x2, y2).
    """
    widths = boxes[:, 2] - boxes[:, 0]
    heights = boxes[:, 3] - boxes[:, 1]
    return widths * heights
def bbox_overlaps(boxes1, boxes2):
    """
    Compute the pairwise IoU between two sets of boxes.

    Args:
        boxes1 (Tensor): boxes with shape [M, 4]
        boxes2 (Tensor): boxes with shape [N, 4]

    Return:
        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N];
            an all-zero float32 tensor when either set is empty.
    """
    num_a, num_b = boxes1.shape[0], boxes2.shape[0]
    if num_a * num_b == 0:
        return paddle.zeros([num_a, num_b], dtype='float32')

    areas_a = bbox_area(boxes1)
    areas_b = bbox_area(boxes2)

    # [M, 1, 4] so that boxes1 broadcasts against every box of boxes2.
    expanded = paddle.unsqueeze(boxes1, 1)
    lower_right = paddle.minimum(expanded[:, :, 2:], boxes2[:, 2:])
    upper_left = paddle.maximum(expanded[:, :, :2], boxes2[:, :2])
    extent = (lower_right - upper_left).clip(min=0)
    inter = extent.prod(axis=2)

    union = paddle.unsqueeze(areas_a, 1) + areas_b - inter
    # Pairs without intersection are reported as exactly 0.
    return paddle.where(inter > 0, inter / union, paddle.zeros_like(inter))
def _get_clones(module, N):
    """Return an ``nn.LayerList`` holding ``N`` deep copies of ``module``."""
    clones = []
    for _ in range(N):
        clones.append(copy.deepcopy(module))
    return nn.LayerList(clones)
def bbox_cxcywh_to_xyxy(x):
    """Convert boxes from (cx, cy, w, h) to (x1, y1, x2, y2) on the last axis."""
    center, size = paddle.split(x, 2, axis=-1)
    half_size = 0.5 * size
    top_left = center - half_size
    bottom_right = center + half_size
    return paddle.concat([top_left, bottom_right], axis=-1)
def bbox_xyxy_to_cxcywh(x):
    """Convert boxes from (x1, y1, x2, y2) to (cx, cy, w, h) on the last axis."""
    left, top, right, bottom = x.split(4, axis=-1)
    center_x = (left + right) / 2
    center_y = (top + bottom) / 2
    width = right - left
    height = bottom - top
    return paddle.concat([center_x, center_y, width, height], axis=-1)
def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
    """Sigmoid focal loss, averaged over axis 1, summed and normalized.

    Args:
        logit (Tensor): Raw scores.
        label (Tensor): Targets with the same shape as ``logit``.
        normalizer (float): Divisor applied to the summed loss.
        alpha (float): Class-balance weight; skipped when negative.
        gamma (float): Exponent of the modulating factor ``(1 - p_t)``.

    Returns:
        Tensor: The normalized loss value.
    """
    prob = F.sigmoid(logit)
    ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
    # Probability assigned to the target of each element.
    p_t = prob * label + (1 - prob) * (1 - label)
    modulating = (1 - p_t)**gamma
    loss = ce_loss * modulating

    if alpha >= 0:
        alpha_t = alpha * label + (1 - alpha) * (1 - label)
        loss = alpha_t * loss
    total = loss.mean(1).sum()
    return total / normalizer
def inverse_sigmoid(x, eps=1e-5):
    """Return ``log(x / (1 - x))`` with ``x`` clipped to [0, 1].

    Numerator and denominator are each clipped to at least ``eps`` so the
    result stays finite at the interval ends.
    """
    clamped = x.clip(min=0., max=1.)
    numerator = clamped.clip(min=eps)
    denominator = (1 - clamped).clip(min=eps)
    return paddle.log(numerator / denominator)
def deformable_attention_core_func(value, value_spatial_shapes,
                                   value_level_start_index, sampling_locations,
                                   attention_weights):
    """
    Multi-scale deformable attention built on ``F.grid_sample``.

    Args:
        value (Tensor): [bs, value_length, n_head, c]
        value_spatial_shapes (Tensor|List): [n_levels, 2]
        value_level_start_index (Tensor|List): [n_levels]; not read by this
            implementation.
        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]

    Returns:
        output (Tensor): [bs, Length_{query}, C]
    """
    bs, _, n_head, c = value.shape
    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape

    level_sizes = [h * w for h, w in value_spatial_shapes]
    per_level_values = value.split(level_sizes, axis=1)

    # Locations are turned into grid_sample coordinates via 2 * x - 1.
    grids = 2 * sampling_locations - 1
    sampled_per_level = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
        level_value = per_level_values[level].flatten(2)
        level_value = level_value.transpose([0, 2, 1])
        level_value = level_value.reshape([bs * n_head, c, h, w])
        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
        level_grid = grids[:, :, :, level]
        level_grid = level_grid.transpose([0, 2, 1, 3, 4]).flatten(0, 1)
        # N_*M_, D_, Lq_, P_
        sampled_per_level.append(
            F.grid_sample(
                level_value,
                level_grid,
                mode='bilinear',
                padding_mode='zeros',
                align_corners=False))

    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
    weights = attention_weights.transpose([0, 2, 1, 3, 4])
    weights = weights.reshape([bs * n_head, 1, Len_q, n_levels * n_points])

    stacked = paddle.stack(sampled_per_level, axis=-2).flatten(-2)
    output = (stacked * weights).sum(-1)
    output = output.reshape([bs, n_head * c, Len_q])

    return output.transpose([0, 2, 1])
def get_valid_ratio(mask):
    """Return per-sample (width, height) ratios of a 3-D ``mask``.

    The height ratio sums the first column of the mask and divides by H;
    the width ratio sums the first row and divides by W.
    """
    _, H, W = paddle.shape(mask)
    first_column_sum = paddle.sum(mask[:, :, 0], 1)
    first_row_sum = paddle.sum(mask[:, 0, :], 1)
    ratio_h = first_column_sum / H
    ratio_w = first_row_sum / W
    # [b, 2]
    return paddle.stack([ratio_w, ratio_h], -1)
def get_denoising_training_group(targets,
                                 num_classes,
                                 num_queries,
                                 class_embed,
                                 num_denoising=100,
                                 label_noise_ratio=0.5,
                                 box_noise_scale=1.0):
    """Build noised ground-truth queries for denoising training.

    Args:
        targets (dict): Reads ``targets["gt_class"]`` and
            ``targets["gt_bbox"]``, one entry per image.
        num_classes (int): Number of classes; also used as the padding label.
        num_queries (int): Number of matching queries that follow the
            denoising queries.
        class_embed (Tensor): Class embedding table; a zero row is appended
            for the padding label.
        num_denoising (int): Denoising query budget.
        label_noise_ratio (float): Labels are replaced with probability
            ``label_noise_ratio * 0.5``.
        box_noise_scale (float): Scale of the random box perturbation.

    Returns:
        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
        four ``None`` values when ``num_denoising <= 0`` or the batch has no
        ground truth.
    """
    if num_denoising <= 0:
        return None, None, None, None

    gt_classes = targets["gt_class"]
    num_gts = [len(t) for t in gt_classes]
    max_gt_num = max(num_gts)
    if max_gt_num == 0:
        return None, None, None, None

    # At least one group, even when the budget is below max_gt_num.
    num_group = max(num_denoising // max_gt_num, 1)

    # pad gt to max_num of a batch
    bs = len(gt_classes)
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i, num_gt in enumerate(num_gts):
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1

    input_query_class = input_query_class.tile([1, num_group])
    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, num_group])

    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])
    # total denoising queries
    num_denoising = int(max_gt_num * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        pad_gt_mask = pad_gt_mask.flatten()
        # half of bbox prob
        flip_prob = label_noise_ratio * 0.5
        mask = paddle.rand(input_query_class.shape) < flip_prob
        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])
        pad_gt_mask.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        box_wh = input_query_bbox[..., 2:]
        diff = paddle.concat([box_wh * 0.5, box_wh], axis=-1) * box_noise_scale
        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
        input_query_bbox += diff
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # Extra zero row serves the padding label (index num_classes).
    padding_row = paddle.zeros([1, class_embed.shape[-1]])
    class_embed = paddle.concat([class_embed, padding_row])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # All-False boolean matrix; True marks a blocked position until the
    # final inversion below.
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other
    for i in range(num_group):
        lo = max_gt_num * i
        hi = max_gt_num * (i + 1)
        if i == 0:
            attn_mask[lo:hi, hi:num_denoising] = True
        if i == num_group - 1:
            attn_mask[lo:hi, :lo] = True
        else:
            attn_mask[lo:hi, hi:num_denoising] = True
            attn_mask[lo:hi, :lo] = True
    attn_mask = ~attn_mask
    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }

    return input_query_class, input_query_bbox, attn_mask, dn_meta
def get_contrastive_denoising_training_group(targets,
                                             num_classes,
                                             num_queries,
                                             class_embed,
                                             num_denoising=100,
                                             label_noise_ratio=0.5,
                                             box_noise_scale=1.0):
    """Build contrastive denoising queries and the matching attention mask.

    Ground truths are padded to the largest per-image count (`max_gt_num`)
    and replicated into `num_group` groups of `2 * max_gt_num` slots. In each
    group the first `max_gt_num` slots are "positive" copies (box noise
    factor drawn from [0, 1)) and the remaining `max_gt_num` slots are
    "negative" copies (box noise factor drawn from [1, 2)).

    Args:
        targets (dict): must provide `"gt_class"` and `"gt_bbox"`, each a
            list with one entry per image. Class entries are squeezed on
            their last axis, so they presumably have shape `(num_gt, 1)`;
            boxes are passed through `bbox_cxcywh_to_xyxy` and clipped to
            [0, 1], so they are presumably normalized cxcywh - confirm
            against the data pipeline.
        num_classes (int): number of classes. The value itself is used as
            the class index of padded slots.
        num_queries (int): number of regular queries that follow the
            denoising queries in the attention mask.
        class_embed (Tensor): 2-D class embedding table. A zero row is
            appended so the padding index `num_classes` can be gathered.
        num_denoising (int): denoising budget; `num_denoising // max_gt_num`
            (at least 1) groups are created. Default: 100.
        label_noise_ratio (float): when positive, labels of real (unpadded)
            slots are re-drawn uniformly from `[0, num_classes)` with
            probability `label_noise_ratio * 0.5`. Default: 0.5.
        box_noise_scale (float): when positive, scales the box corner
            jitter, which is proportional to half the box width/height.
            Default: 1.0.

    Returns:
        tuple: `(None, None, None, None)` when `num_denoising <= 0` or the
        batch holds no ground truth; otherwise
        `(input_query_class, input_query_bbox, attn_mask, dn_meta)` where

        - `input_query_class` is `[bs, total, embed_dim]` gathered class
          embeddings, `total = 2 * max_gt_num * num_group`;
        - `input_query_bbox` is `[bs, total, 4]` (passed through
          `inverse_sigmoid` when `box_noise_scale > 0`);
        - `attn_mask` is a bool `[total + num_queries, total + num_queries]`
          tensor in which `False` marks query pairs that must not attend;
        - `dn_meta` holds `"dn_positive_idx"`, `"dn_num_group"` and
          `"dn_num_split"`.
    """
    nothing_to_denoise = (None, None, None, None)
    if num_denoising <= 0:
        return nothing_to_denoise

    num_gts = [len(t) for t in targets["gt_class"]]
    # `default=0` routes an empty batch to the early return below instead of
    # letting `max()` raise ValueError on an empty list.
    max_gt_num = max(num_gts, default=0)
    if max_gt_num == 0:
        return nothing_to_denoise

    num_group = max(num_denoising // max_gt_num, 1)

    # pad gt to max_num of a batch
    bs = len(num_gts)
    input_query_class = paddle.full(
        [bs, max_gt_num], num_classes, dtype='int32')
    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
    pad_gt_mask = paddle.zeros([bs, max_gt_num])
    for i, num_gt in enumerate(num_gts):
        if num_gt > 0:
            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
            pad_gt_mask[i, :num_gt] = 1

    # each group has positive and negative queries.
    input_query_class = input_query_class.tile([1, 2 * num_group])
    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])

    # positive and negative mask: second half of every group is negative
    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])
    negative_gt_mask[:, max_gt_num:] = 1
    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
    positive_gt_mask = 1 - negative_gt_mask

    # contrastive denoising training positive index (padded slots excluded)
    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
    dn_positive_idx = paddle.split(dn_positive_idx,
                                   [n * num_group for n in num_gts])

    # total denoising queries
    group_size = max_gt_num * 2
    num_denoising = int(group_size * num_group)

    if label_noise_ratio > 0:
        input_query_class = input_query_class.flatten()
        # bool view of the padding mask, used only to select real slots
        real_slot = pad_gt_mask.flatten().astype('bool')
        # half of bbox prob
        flip = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
        chosen_idx = paddle.nonzero(flip * real_slot).squeeze(-1)
        # randomly put a new one here
        new_label = paddle.randint_like(
            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
        input_query_class.scatter_(chosen_idx, new_label)
        input_query_class.reshape_([bs, num_denoising])

    if box_noise_scale > 0:
        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)
        # per-corner jitter budget: half of (w, h), repeated for both corners
        diff = paddle.tile(input_query_bbox[..., 2:] * 0.5,
                           [1, 1, 2]) * box_noise_scale
        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
        rand_part = paddle.rand(input_query_bbox.shape)
        # negatives get a factor in [1, 2), positives keep [0, 1)
        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
            1 - negative_gt_mask)
        rand_part *= rand_sign
        known_bbox += rand_part * diff
        known_bbox.clip_(min=0.0, max=1.0)
        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
        # NOTE(review): the logit conversion is tied to the noise branch;
        # confirm callers never disable box noise and still expect logits.
        input_query_bbox = inverse_sigmoid(input_query_bbox)

    # extra zero row serves the padding class index `num_classes`
    class_embed = paddle.concat(
        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
    input_query_class = paddle.gather(
        class_embed, input_query_class.flatten(),
        axis=0).reshape([bs, num_denoising, -1])

    tgt_size = num_denoising + num_queries
    # start all-False; True marks blocked pairs until the final inversion
    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
    # match query cannot see the reconstruction
    attn_mask[num_denoising:, :num_denoising] = True
    # reconstruct cannot see each other: hide every other group's columns
    for group in range(num_group):
        start = group_size * group
        stop = start + group_size
        if stop < num_denoising:
            attn_mask[start:stop, stop:num_denoising] = True
        if start > 0:
            attn_mask[start:stop, :start] = True
    attn_mask = ~attn_mask

    dn_meta = {
        "dn_positive_idx": dn_positive_idx,
        "dn_num_group": num_group,
        "dn_num_split": [num_denoising, num_queries]
    }
    return input_query_class, input_query_bbox, attn_mask, dn_meta
def get_sine_pos_embed(pos_tensor,
                       num_pos_feats=128,
                       temperature=10000,
                       exchange_xy=True):
    """Generate a sine/cosine position embedding from a position tensor.

    Args:
        pos_tensor (Tensor): positions to embed. The slicing below needs a
            rank-3 tensor of shape `(d0, d1, n)` (presumably
            `(batch, num_queries, n)` - confirm against callers).
        num_pos_feats (int): number of embedding channels produced for each
            of the `n` scalars. Sine and cosine halves are stacked, so it
            should be even. Default: 128.
        temperature (int): base of the frequency progression used to scale
            the positions. Default: 10000.
        exchange_xy (bool, optional): swap the embeddings of the first two
            coordinates, e.g. input `[x, y]` yields `[pos(y), pos(x)]`.
            Skipped when the input carries a single coordinate.
            Default: True.

    Returns:
        Tensor: position embedding with shape `(d0, d1, n * num_pos_feats)`.
    """
    scale = 2. * math.pi
    # channel pairs (2k, 2k + 1) share one exponent: 2 * floor(i / 2)
    dim_t = 2. * paddle.floor_divide(
        paddle.arange(num_pos_feats), paddle.to_tensor(2))
    dim_t = scale / temperature**(dim_t / num_pos_feats)

    def sine_func(x):
        # Out-of-place multiply: the product broadcasts the size-1 last axis
        # of `x` up to `num_pos_feats`, and the caller's tensor stays intact.
        x = x * dim_t
        # sine on even channels, cosine on odd channels, then interleave
        return paddle.stack(
            (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2)

    # one size-1 chunk per coordinate along the last axis
    pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)]
    if exchange_xy and len(pos_res) > 1:
        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
    return paddle.concat(pos_res, axis=2)
def mask_to_box_coordinate(mask,
                           normalize=False,
                           format="xyxy",
                           dtype="float32"):
    """Compute the bounding boxes around the provided masks.

    Args:
        mask (Tensor:bool): [b, c, h, w]
        normalize (bool): when True, x coordinates are divided by `w` and
            y coordinates by `h`. Default: False.
        format (str): `"xyxy"` or `"xywh"`. For `"xywh"` the xyxy result is
            passed through `bbox_xyxy_to_cxcywh`. Default: "xyxy".
        dtype (str): dtype of the coordinate grid and of the returned boxes.
            Default: "float32".

    Returns:
        bbox (Tensor): [b, c, 4]. All zeros when `mask` has no True element.
        The max corner is the last covered index plus one.
    """
    assert mask.ndim == 4
    assert format in ["xyxy", "xywh"]

    if mask.sum() == 0:
        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)

    h, w = mask.shape[-2:]
    y, x = paddle.meshgrid(
        paddle.arange(
            end=h, dtype=dtype), paddle.arange(
                end=w, dtype=dtype))

    # Explicit cast so the grid is never multiplied by a bool tensor.
    mask_weight = mask.astype(dtype)
    # Sentinel for uncovered pixels, kept in the same dtype as the grid.
    far_away = paddle.to_tensor(1e8, dtype=dtype)

    def _extent(coord):
        # Return (min, max + 1) of `coord` over the covered pixels.
        # NOTE(review): a single all-False [h, w] slice inside a non-empty
        # batch yields min == 1e8 and max == 1 - confirm callers tolerate it.
        covered = coord * mask_weight
        upper = covered.flatten(-2).max(-1) + 1
        lower = paddle.where(mask, covered, far_away).flatten(-2).min(-1)
        return lower, upper

    x_min, x_max = _extent(x)
    y_min, y_max = _extent(y)

    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)
    if normalize:
        out_bbox = out_bbox / paddle.to_tensor([w, h, w, h]).astype(dtype)

    return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox)
def varifocal_loss_with_logits(pred_logits,
                               gt_score,
                               label,
                               normalizer=1.0,
                               alpha=0.75,
                               gamma=2.0):
    """Weighted binary cross-entropy (varifocal style) on raw logits.

    Args:
        pred_logits (Tensor): un-activated classification scores. The loss
            is averaged over axis 1, so at least two axes are required
            (presumably `(bs, num_queries, num_classes)` - TODO confirm).
        gt_score (Tensor): soft target passed to the cross-entropy.
        label (Tensor): indicator selecting the positive weighting term;
            presumably 0/1 valued - confirm against the caller.
        normalizer (float): divisor applied to the summed loss. Default: 1.0.
        alpha (float): scale of the negative-sample weight. Default: 0.75.
        gamma (float): exponent applied to the predicted score in the
            negative-sample weight. Default: 2.0.

    Returns:
        Tensor: the loss, averaged over axis 1, summed over the remaining
        axes and divided by `normalizer`.
    """
    pred_score = F.sigmoid(pred_logits)
    # negatives are weighted by their own (scaled) confidence ...
    negative_weight = alpha * pred_score.pow(gamma) * (1 - label)
    # ... positives by their target score
    positive_weight = gt_score * label
    weight = negative_weight + positive_weight
    elementwise = F.binary_cross_entropy_with_logits(
        pred_logits, gt_score, weight=weight, reduction='none')
    averaged = elementwise.mean(1)
    return averaged.sum() / normalizer
from ..initializer import linear_init_
class MLP(nn.Layer):
    """Multi-layer perceptron: linear layers with ReLU between them.

    This code is based on
    https://github.com/facebookresearch/detr/blob/main/models/detr.py
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        """Create `num_layers` linear layers and initialize them.

        Args:
            input_dim (int): feature size fed to the first layer.
            hidden_dim (int): feature size of every intermediate layer.
            output_dim (int): feature size produced by the last layer.
            num_layers (int): total number of linear layers.
        """
        super().__init__()
        self.num_layers = num_layers
        hidden_dims = [hidden_dim] * (num_layers - 1)
        in_dims = [input_dim] + hidden_dims
        out_dims = hidden_dims + [output_dim]
        self.layers = nn.LayerList(
            [nn.Linear(fan_in, fan_out)
             for fan_in, fan_out in zip(in_dims, out_dims)])
        self._reset_parameters()

    def _reset_parameters(self):
        """Apply `linear_init_` to every linear layer."""
        for linear in self.layers:
            linear_init_(linear)

    def forward(self, x):
        """Run `x` through the stack; ReLU follows every layer but the last."""
        last_idx = self.num_layers - 1
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx < last_idx:
                x = F.relu(x)
        return x