first commit

2026-06-03 12:42:47 +08:00
commit ec23799148
339 changed files with 57120 additions and 0 deletions
--- a/rtdetr_paddle/ppdet/modeling/transformers/__init__.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .utils import *
+from .matchers import *
+from .position_encoding import *
+from .rtdetr_transformer import *
+from .dino_transformer import *
+from .hybrid_encoder import *
--- a/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/deformable_transformer.py
@@ -0,0 +1,537 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .position_encoding import PositionEmbedding
+from .utils import _get_clones, get_valid_ratio
+from ..initializer import linear_init_, constant_, xavier_uniform_, normal_
+
+__all__ = ['DeformableTransformer']
+
+
+class MSDeformableAttention(nn.Layer):
+    """Multi-Scale Deformable Attention Module.
+
+    For every query, linear layers predict
+    ``num_heads * num_levels * num_points`` 2-D sampling offsets and the same
+    number of attention weights. The projected values are sampled at the
+    resulting locations by ``ms_deformable_attn_core`` and the result is
+    passed through ``output_proj``.
+
+    Args:
+        embed_dim (int): Channel size of queries/values. Must be divisible
+            by ``num_heads``.
+        num_heads (int): Number of attention heads.
+        num_levels (int): Number of feature levels that are sampled.
+        num_points (int): Sampling points per head and per level.
+        lr_mult (float): Learning-rate multiplier set on the weight and bias
+            of the ``sampling_offsets`` projection.
+    """
+
+    def __init__(self,
+                 embed_dim=256,
+                 num_heads=8,
+                 num_levels=4,
+                 num_points=4,
+                 lr_mult=0.1):
+        super(MSDeformableAttention, self).__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.num_levels = num_levels
+        self.num_points = num_points
+        self.total_points = num_heads * num_levels * num_points
+
+        self.head_dim = embed_dim // num_heads
+        assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
+
+        # Two coordinates (x, y) per sampling point.
+        self.sampling_offsets = nn.Linear(
+            embed_dim,
+            self.total_points * 2,
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+
+        self.attention_weights = nn.Linear(embed_dim, self.total_points)
+        self.value_proj = nn.Linear(embed_dim, embed_dim)
+        self.output_proj = nn.Linear(embed_dim, embed_dim)
+        try:
+            # Prefer the compiled custom operator when it is available.
+            from deformable_detr_ops import ms_deformable_attn
+        except Exception:
+            # Any failure to load the extension (module missing, broken
+            # binary, ...) falls back to the pure-paddle implementation.
+            # ``Exception`` rather than a bare ``except`` so that
+            # KeyboardInterrupt / SystemExit are not swallowed.
+            from .utils import deformable_attention_core_func as ms_deformable_attn
+        self.ms_deformable_attn_core = ms_deformable_attn
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Initialise the projections.
+
+        The ``sampling_offsets`` bias is set so that each head initially
+        looks in its own direction (angles evenly spaced over the circle),
+        with the k-th point of a head placed k steps along that direction.
+        """
+        # sampling_offsets
+        constant_(self.sampling_offsets.weight)
+        # One angle per head, evenly spaced in [0, 2*pi).
+        thetas = paddle.arange(
+            self.num_heads,
+            dtype=paddle.float32) * (2.0 * math.pi / self.num_heads)
+        grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1)
+        # Rescale each direction so its largest absolute component is 1.
+        grid_init = grid_init / grid_init.abs().max(-1, keepdim=True)
+        # Same direction for every level and point of a head.
+        grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile(
+            [1, self.num_levels, self.num_points, 1])
+        # Scale the k-th point (k = 1..num_points) by k.
+        scaling = paddle.arange(
+            1, self.num_points + 1,
+            dtype=paddle.float32).reshape([1, 1, -1, 1])
+        grid_init *= scaling
+        self.sampling_offsets.bias.set_value(grid_init.flatten())
+        # attention_weights
+        constant_(self.attention_weights.weight)
+        constant_(self.attention_weights.bias)
+        # proj
+        xavier_uniform_(self.value_proj.weight)
+        constant_(self.value_proj.bias)
+        xavier_uniform_(self.output_proj.weight)
+        constant_(self.output_proj.bias)
+
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_level_start_index,
+                value_mask=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area. A last dimension
+                of 4 is also accepted (see the second branch below).
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+
+        Raises:
+            ValueError: If the last dimension of ``reference_points`` is
+                neither 2 nor 4.
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+        # The flattened value sequence must cover every level exactly.
+        assert int(value_spatial_shapes.prod(1).sum()) == Len_v
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            # Zero out the projected values at padded positions.
+            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
+            value *= value_mask
+        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
+        # Softmax is taken over the merged (levels * points) axis so the
+        # weights of one head sum to 1 across all levels and points.
+        attention_weights = self.attention_weights(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
+        attention_weights = F.softmax(attention_weights).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])
+
+        if reference_points.shape[-1] == 2:
+            # Offsets are divided by (W, H) of each level; ``flip`` turns
+            # the stored (H, W) order into (W, H).
+            offset_normalizer = value_spatial_shapes.flip([1]).reshape(
+                [1, 1, 1, self.num_levels, 1, 2])
+            sampling_locations = reference_points.reshape([
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ]) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            # First two entries are used as the centre and the last two as
+            # an extent that scales the offsets (presumably a cx, cy, w, h
+            # box -- confirm against callers).
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] *
+                0.5)
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        output = self.ms_deformable_attn_core(
+            value, value_spatial_shapes, value_level_start_index,
+            sampling_locations, attention_weights)
+        output = self.output_proj(output)
+
+        return output
+
+
+class DeformableTransformerEncoderLayer(nn.Layer):
+    """Encoder block: multi-scale deformable self-attention followed by a
+    feed-forward network, each applied as residual + dropout + LayerNorm."""
+
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 lr_mult=0.1,
+                 weight_attr=None,
+                 bias_attr=None):
+        super().__init__()
+        # Both LayerNorms share the same parameter attributes.
+        norm_cfg = dict(weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # --- deformable self-attention branch ---
+        self.self_attn = MSDeformableAttention(
+            d_model, n_head, n_levels, n_points, lr_mult)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model, **norm_cfg)
+
+        # --- feed-forward branch ---
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        # ``activation`` names a function in paddle.nn.functional.
+        self.activation = getattr(F, activation)
+        self.dropout2 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout3 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model, **norm_cfg)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Run ``linear_init_`` on both FFN layers, then re-initialise
+        their weights with ``xavier_uniform_``."""
+        ffn_layers = (self.linear1, self.linear2)
+        for fc in ffn_layers:
+            linear_init_(fc)
+        for fc in ffn_layers:
+            xavier_uniform_(fc.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, src):
+        """Feed-forward sub-block with residual connection and ``norm2``."""
+        hidden = self.activation(self.linear1(src))
+        ffn_out = self.linear2(self.dropout2(hidden))
+        return self.norm2(src + self.dropout3(ffn_out))
+
+    def forward(self,
+                src,
+                reference_points,
+                spatial_shapes,
+                level_start_index,
+                src_mask=None,
+                query_pos_embed=None):
+        """Apply self-attention then the FFN to ``src`` and return the result.
+
+        The positional embedding is added to the query only; the
+        un-embedded ``src`` is used as the attention value.
+        """
+        query = self.with_pos_embed(src, query_pos_embed)
+        attn_out = self.self_attn(query, reference_points, src,
+                                  spatial_shapes, level_start_index, src_mask)
+        src = self.norm1(src + self.dropout1(attn_out))
+        return self.forward_ffn(src)
+
+
+class DeformableTransformerEncoder(nn.Layer):
+    """Stack of ``num_layers`` clones of ``encoder_layer``."""
+
+    def __init__(self, encoder_layer, num_layers):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, offset=0.5):
+        """Build one reference point per feature-map cell.
+
+        For each level a grid of cell positions (shifted by ``offset``) is
+        divided by the level's valid extent. The per-level grids are
+        concatenated along the sequence axis and finally multiplied by
+        ``valid_ratios`` so the result carries one copy per level.
+        """
+        valid_ratios = valid_ratios.unsqueeze(1)
+        per_level = []
+        for lvl, (H, W) in enumerate(spatial_shapes):
+            rows = paddle.arange(end=H) + offset
+            cols = paddle.arange(end=W) + offset
+            grid_y, grid_x = paddle.meshgrid(rows, cols)
+            # Index 1 of the ratio scales height, index 0 scales width.
+            norm_y = valid_ratios[:, :, lvl, 1] * H
+            norm_x = valid_ratios[:, :, lvl, 0] * W
+            grid_y = grid_y.flatten().unsqueeze(0) / norm_y
+            grid_x = grid_x.flatten().unsqueeze(0) / norm_x
+            # Points are stored in (x, y) order.
+            per_level.append(paddle.stack((grid_x, grid_y), axis=-1))
+        points = paddle.concat(per_level, 1).unsqueeze(2)
+        return points * valid_ratios
+
+    def forward(self,
+                feat,
+                spatial_shapes,
+                level_start_index,
+                feat_mask=None,
+                query_pos_embed=None,
+                valid_ratios=None):
+        """Run ``feat`` through every encoder layer in order.
+
+        When ``valid_ratios`` is not given, all-ones ratios of shape
+        [batch, n_levels, 2] are used.
+        """
+        if valid_ratios is None:
+            n_batch = feat.shape[0]
+            n_levels = spatial_shapes.shape[0]
+            valid_ratios = paddle.ones([n_batch, n_levels, 2])
+
+        ref_points = self.get_reference_points(spatial_shapes, valid_ratios)
+        for enc_layer in self.layers:
+            feat = enc_layer(feat, ref_points, spatial_shapes,
+                             level_start_index, feat_mask, query_pos_embed)
+        return feat
+
+
+class DeformableTransformerDecoderLayer(nn.Layer):
+    """Decoder block: multi-head self-attention over the queries,
+    deformable cross-attention into the encoder memory, then a feed-forward
+    network. Each sub-block is residual + dropout + LayerNorm."""
+
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 lr_mult=0.1,
+                 weight_attr=None,
+                 bias_attr=None):
+        super().__init__()
+        # All three LayerNorms share the same parameter attributes.
+        norm_cfg = dict(weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # --- query self-attention ---
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(d_model, **norm_cfg)
+
+        # --- deformable cross-attention into memory ---
+        self.cross_attn = MSDeformableAttention(
+            d_model, n_head, n_levels, n_points, lr_mult)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(d_model, **norm_cfg)
+
+        # --- feed-forward network ---
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        # ``activation`` names a function in paddle.nn.functional.
+        self.activation = getattr(F, activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(d_model, **norm_cfg)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Run ``linear_init_`` on both FFN layers, then re-initialise
+        their weights with ``xavier_uniform_``."""
+        ffn_layers = (self.linear1, self.linear2)
+        for fc in ffn_layers:
+            linear_init_(fc)
+        for fc in ffn_layers:
+            xavier_uniform_(fc.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        """Return ``tensor + pos``, or ``tensor`` unchanged if ``pos`` is None."""
+        if pos is None:
+            return tensor
+        return tensor + pos
+
+    def forward_ffn(self, tgt):
+        """Feed-forward sub-block with residual connection and ``norm3``."""
+        hidden = self.activation(self.linear1(tgt))
+        ffn_out = self.linear2(self.dropout3(hidden))
+        return self.norm3(tgt + self.dropout4(ffn_out))
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Update the queries ``tgt`` using self- and cross-attention."""
+        # Self-attention: query and key carry the positional embedding,
+        # the value does not.
+        qk = self.with_pos_embed(tgt, query_pos_embed)
+        self_out = self.self_attn(qk, qk, value=tgt)
+        tgt = self.norm1(tgt + self.dropout1(self_out))
+
+        # Cross-attention: sample the encoder memory around the
+        # reference points.
+        cross_query = self.with_pos_embed(tgt, query_pos_embed)
+        cross_out = self.cross_attn(
+            cross_query, reference_points, memory, memory_spatial_shapes,
+            memory_level_start_index, memory_mask)
+        tgt = self.norm2(tgt + self.dropout2(cross_out))
+
+        # Feed-forward network.
+        return self.forward_ffn(tgt)
+
+
+class DeformableTransformerDecoder(nn.Layer):
+    """Stack of ``num_layers`` clones of ``decoder_layer``.
+
+    With ``return_intermediate`` the output of every layer is returned,
+    stacked along a new leading axis; otherwise only the last output is
+    returned, with a leading axis of size 1.
+    """
+
+    def __init__(self, decoder_layer, num_layers, return_intermediate=False):
+        super().__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                memory_mask=None,
+                query_pos_embed=None):
+        """Feed ``tgt`` through every decoder layer in order."""
+        keep_all = self.return_intermediate
+        per_layer = []
+        output = tgt
+        for dec_layer in self.layers:
+            output = dec_layer(output, reference_points, memory,
+                               memory_spatial_shapes,
+                               memory_level_start_index, memory_mask,
+                               query_pos_embed)
+            if keep_all:
+                per_layer.append(output)
+
+        if keep_all:
+            return paddle.stack(per_layer)
+        return output.unsqueeze(0)
+
+
+@register
+class DeformableTransformer(nn.Layer):
+    """Deformable-DETR transformer: input projection, multi-scale
+    deformable encoder and decoder with learned object queries.
+
+    Args:
+        num_queries (int): Number of object queries (rows of ``tgt_embed``
+            and ``query_pos_embed``).
+        position_embed_type (str): ``'sine'`` or ``'learned'``.
+        return_intermediate_dec (bool): Forwarded to the decoder as
+            ``return_intermediate``.
+        in_feats_channel (list[int]): Channel count of every input feature
+            map. At most ``num_feature_levels`` entries.
+        num_feature_levels (int): Number of feature levels fed to the
+            encoder. Levels beyond ``len(in_feats_channel)`` are produced
+            by extra stride-2 3x3 convolutions.
+        num_encoder_points (int): Sampling points per head and level in the
+            encoder.
+        num_decoder_points (int): Sampling points per head and level in the
+            decoder.
+        hidden_dim (int): Model width.
+        nhead (int): Number of attention heads.
+        num_encoder_layers (int): Encoder depth.
+        num_decoder_layers (int): Decoder depth.
+        dim_feedforward (int): Hidden width of the feed-forward networks.
+        dropout (float): Dropout probability.
+        activation (str): Name of a function in ``paddle.nn.functional``.
+        lr_mult (float): Learning-rate multiplier for the
+            ``reference_points`` projection and for the sampling-offset
+            projections of the encoder and decoder layers.
+        pe_temperature (int): ``temperature`` of the position embedding.
+        pe_offset (float): ``offset`` of the position embedding.
+    """
+    __shared__ = ['hidden_dim']
+
+    def __init__(self,
+                 num_queries=300,
+                 position_embed_type='sine',
+                 return_intermediate_dec=True,
+                 in_feats_channel=[512, 1024, 2048],
+                 num_feature_levels=4,
+                 num_encoder_points=4,
+                 num_decoder_points=4,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.1,
+                 activation="relu",
+                 lr_mult=0.1,
+                 pe_temperature=10000,
+                 pe_offset=-0.5):
+        super(DeformableTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(in_feats_channel) <= num_feature_levels
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.num_feature_levels = num_feature_levels
+
+        encoder_layer = DeformableTransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            num_feature_levels, num_encoder_points, lr_mult)
+        self.encoder = DeformableTransformerEncoder(encoder_layer,
+                                                    num_encoder_layers)
+
+        # Forward ``lr_mult`` to the decoder as well; previously only the
+        # encoder received it and the decoder silently used its own default.
+        decoder_layer = DeformableTransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            num_feature_levels, num_decoder_points, lr_mult)
+        self.decoder = DeformableTransformerDecoder(
+            decoder_layer, num_decoder_layers, return_intermediate_dec)
+
+        self.level_embed = nn.Embedding(num_feature_levels, hidden_dim)
+        self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
+
+        # Maps each query's positional embedding to a 2-D reference point.
+        self.reference_points = nn.Linear(
+            hidden_dim,
+            2,
+            weight_attr=ParamAttr(learning_rate=lr_mult),
+            bias_attr=ParamAttr(learning_rate=lr_mult))
+
+        # 1x1 projections for the provided feature maps ...
+        self.input_proj = nn.LayerList()
+        for in_channels in in_feats_channel:
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        in_channels, hidden_dim, kernel_size=1),
+                    nn.GroupNorm(32, hidden_dim)))
+        # ... and stride-2 3x3 convolutions for the additional levels. The
+        # first one reads the last input feature map, later ones read the
+        # previous projected level.
+        in_channels = in_feats_channel[-1]
+        for _ in range(num_feature_levels - len(in_feats_channel)):
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        in_channels,
+                        hidden_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1),
+                    nn.GroupNorm(32, hidden_dim)))
+            in_channels = hidden_dim
+
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type,
+            offset=pe_offset,
+            eps=1e-4)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Initialise the embeddings, the reference-point projection and
+        the convolution of every input projection."""
+        normal_(self.level_embed.weight)
+        normal_(self.tgt_embed.weight)
+        normal_(self.query_pos_embed.weight)
+        xavier_uniform_(self.reference_points.weight)
+        constant_(self.reference_points.bias)
+        for l in self.input_proj:
+            # l[0] is the Conv2D of the Sequential; l[1] is the GroupNorm.
+            xavier_uniform_(l[0].weight)
+            constant_(l[0].bias)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        """Derive ``in_feats_channel`` from the ``channels`` attribute of
+        every entry of ``input_shape``."""
+        return {'in_feats_channel': [i.channels for i in input_shape], }
+
+    def forward(self, src_feats, src_mask=None, *args, **kwargs):
+        """Encode the feature maps and decode the object queries.
+
+        Args:
+            src_feats (list[Tensor]): Feature maps, one per entry of
+                ``in_feats_channel``; indexed as [batch, C, H, W].
+            src_mask (Tensor, optional): Mask that is resized to every
+                level. When None, an all-ones mask is used and no mask is
+                passed to the encoder/decoder.
+
+        Returns:
+            tuple: ``(hs, memory, reference_points)`` -- the decoder output,
+            the encoder output and the sigmoid-activated reference points
+            predicted from the query positional embeddings.
+        """
+        # Project the provided feature maps to ``hidden_dim`` channels.
+        srcs = [self.input_proj[i](feat) for i, feat in enumerate(src_feats)]
+        # Build the additional levels: the first one from the last input
+        # feature map, the following ones from the previous projected level.
+        len_srcs = len(srcs)
+        for i in range(len_srcs, self.num_feature_levels):
+            if i == len_srcs:
+                srcs.append(self.input_proj[i](src_feats[-1]))
+            else:
+                srcs.append(self.input_proj[i](srcs[-1]))
+        src_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        valid_ratios = []
+        for level, src in enumerate(srcs):
+            # Shapes are kept as 1-element tensors (slices of paddle.shape).
+            src_shape = paddle.shape(src)
+            bs = src_shape[0:1]
+            h = src_shape[2:3]
+            w = src_shape[3:4]
+            spatial_shapes.append(paddle.concat([h, w]))
+            # [bs, C, H, W] -> [bs, H*W, C]
+            src = src.flatten(2).transpose([0, 2, 1])
+            src_flatten.append(src)
+            if src_mask is not None:
+                mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
+            else:
+                mask = paddle.ones([bs, h, w])
+            valid_ratios.append(get_valid_ratio(mask))
+            # Position embedding plus a per-level embedding vector.
+            pos_embed = self.position_embedding(mask).flatten(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed.weight[level]
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            mask = mask.flatten(1)
+            mask_flatten.append(mask)
+        src_flatten = paddle.concat(src_flatten, 1)
+        mask_flatten = None if src_mask is None else paddle.concat(mask_flatten,
+                                                                   1)
+        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
+        # [l, 2]
+        spatial_shapes = paddle.to_tensor(
+            paddle.stack(spatial_shapes).astype('int64'))
+        # [l], start index of every level in the flattened sequence
+        level_start_index = paddle.concat([
+            paddle.zeros(
+                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
+        ])
+        # [b, l, 2]
+        valid_ratios = paddle.stack(valid_ratios, 1)
+
+        # encoder
+        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
+                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
+
+        # prepare input for decoder
+        bs = memory.shape[0]
+        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        reference_points = F.sigmoid(self.reference_points(query_embed))
+        # One copy of every reference point per level, scaled by that
+        # level's valid ratio.
+        reference_points_input = reference_points.unsqueeze(
+            2) * valid_ratios.unsqueeze(1)
+
+        # decoder
+        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
+                          level_start_index, mask_flatten, query_embed)
+
+        return (hs, memory, reference_points)
--- a/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/detr_transformer.py
@@ -0,0 +1,359 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention, _convert_attention_mask
+from .position_encoding import PositionEmbedding
+from .utils import _get_clones
+from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_
+
+__all__ = ['DETRTransformer']
+
+
+class TransformerEncoderLayer(nn.Layer):
+    """DETR encoder block: multi-head self-attention followed by a
+    feed-forward network.
+
+    ``normalize_before`` selects pre-norm (LayerNorm applied to the input of
+    each sub-block) instead of post-norm (LayerNorm applied after the
+    residual sum). ``attn_dropout`` and ``act_dropout`` fall back to
+    ``dropout`` when None.
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super().__init__()
+        if attn_dropout is None:
+            attn_dropout = dropout
+        if act_dropout is None:
+            act_dropout = dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+
+        # Feed-forward network.
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        # ``activation`` names a function in paddle.nn.functional.
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Apply ``linear_init_`` to both feed-forward layers."""
+        for fc in (self.linear1, self.linear2):
+            linear_init_(fc)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
+        if pos_embed is None:
+            return tensor
+        return tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        """Run self-attention and the FFN on ``src`` and return the result."""
+        pre_norm = self.normalize_before
+
+        # ---- self-attention sub-block ----
+        shortcut = src
+        if pre_norm:
+            src = self.norm1(src)
+        # Query and key carry the positional embedding, the value does not.
+        qk = self.with_pos_embed(src, pos_embed)
+        attn_out = self.self_attn(qk, qk, value=src, attn_mask=src_mask)
+        src = shortcut + self.dropout1(attn_out)
+        if not pre_norm:
+            src = self.norm1(src)
+
+        # ---- feed-forward sub-block ----
+        shortcut = src
+        if pre_norm:
+            src = self.norm2(src)
+        hidden = self.activation(self.linear1(src))
+        ffn_out = self.linear2(self.dropout(hidden))
+        src = shortcut + self.dropout2(ffn_out)
+        if not pre_norm:
+            src = self.norm2(src)
+        return src
+
+
+class TransformerEncoder(nn.Layer):
+    """Stack of ``num_layers`` clones of ``encoder_layer`` with an optional
+    final ``norm`` layer."""
+
+    def __init__(self, encoder_layer, num_layers, norm=None):
+        super().__init__()
+        self.layers = _get_clones(encoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        """Pass ``src`` through every layer, then through ``norm`` if set."""
+        hidden = src
+        for enc_layer in self.layers:
+            hidden = enc_layer(hidden, src_mask=src_mask, pos_embed=pos_embed)
+
+        if self.norm is None:
+            return hidden
+        return self.norm(hidden)
+
+
+class TransformerDecoderLayer(nn.Layer):
+    """DETR decoder block: self-attention over the queries, cross-attention
+    into the encoder memory, then a feed-forward network.
+
+    ``normalize_before`` selects pre-norm instead of post-norm for all three
+    sub-blocks. ``attn_dropout`` and ``act_dropout`` fall back to
+    ``dropout`` when None.
+    """
+
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super().__init__()
+        if attn_dropout is None:
+            attn_dropout = dropout
+        if act_dropout is None:
+            act_dropout = dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+
+        # Feed-forward network.
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train")
+        # ``activation`` names a function in paddle.nn.functional.
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Apply ``linear_init_`` to both feed-forward layers."""
+        for fc in (self.linear1, self.linear2):
+            linear_init_(fc)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        """Return ``tensor + pos_embed``, or ``tensor`` if it is None."""
+        if pos_embed is None:
+            return tensor
+        return tensor + pos_embed
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        """Update the queries ``tgt`` by attending to themselves and to
+        ``memory``."""
+        pre_norm = self.normalize_before
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+
+        # ---- self-attention over the queries ----
+        shortcut = tgt
+        if pre_norm:
+            tgt = self.norm1(tgt)
+        qk = self.with_pos_embed(tgt, query_pos_embed)
+        self_out = self.self_attn(qk, qk, value=tgt, attn_mask=tgt_mask)
+        tgt = shortcut + self.dropout1(self_out)
+        if not pre_norm:
+            tgt = self.norm1(tgt)
+
+        # ---- cross-attention into the encoder memory ----
+        shortcut = tgt
+        if pre_norm:
+            tgt = self.norm2(tgt)
+        # Positional embeddings go on query and key; the value is the raw
+        # memory.
+        cross_q = self.with_pos_embed(tgt, query_pos_embed)
+        cross_k = self.with_pos_embed(memory, pos_embed)
+        cross_out = self.cross_attn(
+            cross_q, cross_k, value=memory, attn_mask=memory_mask)
+        tgt = shortcut + self.dropout2(cross_out)
+        if not pre_norm:
+            tgt = self.norm2(tgt)
+
+        # ---- feed-forward network ----
+        shortcut = tgt
+        if pre_norm:
+            tgt = self.norm3(tgt)
+        hidden = self.activation(self.linear1(tgt))
+        ffn_out = self.linear2(self.dropout(hidden))
+        tgt = shortcut + self.dropout3(ffn_out)
+        if not pre_norm:
+            tgt = self.norm3(tgt)
+        return tgt
+
+
+class TransformerDecoder(nn.Layer):
+    """Stack of ``num_layers`` clones of ``decoder_layer``.
+
+    Args:
+        decoder_layer (nn.Layer): Layer that is cloned ``num_layers`` times.
+        num_layers (int): Number of decoder layers.
+        norm (nn.Layer, optional): Normalisation applied to the decoder
+            output (and to every intermediate output when
+            ``return_intermediate`` is set). May be None.
+        return_intermediate (bool): Return the output of every layer
+            stacked along a new leading axis instead of only the last one.
+    """
+
+    def __init__(self,
+                 decoder_layer,
+                 num_layers,
+                 norm=None,
+                 return_intermediate=False):
+        super(TransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.num_layers = num_layers
+        self.norm = norm
+        self.return_intermediate = return_intermediate
+
+    def forward(self,
+                tgt,
+                memory,
+                tgt_mask=None,
+                memory_mask=None,
+                pos_embed=None,
+                query_pos_embed=None):
+        """Decode the queries ``tgt`` against ``memory``.
+
+        Returns:
+            Tensor: The outputs of all layers stacked on axis 0 when
+            ``return_intermediate`` is set, otherwise the final output with
+            a leading axis of size 1. ``norm`` is applied when it is not
+            None.
+        """
+        tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
+
+        output = tgt
+        intermediate = []
+        for layer in self.layers:
+            output = layer(
+                output,
+                memory,
+                tgt_mask=tgt_mask,
+                memory_mask=memory_mask,
+                pos_embed=pos_embed,
+                query_pos_embed=query_pos_embed)
+            if self.return_intermediate:
+                # ``norm`` is optional; calling None here used to raise a
+                # TypeError when return_intermediate was combined with the
+                # default norm=None.
+                intermediate.append(output if self.norm is None else
+                                    self.norm(output))
+
+        if self.return_intermediate:
+            # Every entry is already normalised, so the final norm below
+            # is not needed on this path.
+            return paddle.stack(intermediate)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output.unsqueeze(0)
+
+
+@register
+class DETRTransformer(nn.Layer):
+    __shared__ = ['hidden_dim']
+
+    def __init__(self,
+                 num_queries=100,
+                 position_embed_type='sine',
+                 return_intermediate_dec=True,
+                 backbone_num_channels=2048,
+                 hidden_dim=256,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=2048,
+                 dropout=0.1,
+                 activation="relu",
+                 pe_temperature=10000,
+                 pe_offset=0.,
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        """Build the DETR encoder/decoder stacks and their input layers.
+
+        Args:
+            num_queries (int): Number of rows of ``query_pos_embed``.
+            position_embed_type (str): ``'sine'`` or ``'learned'``.
+            return_intermediate_dec (bool): Forwarded to the decoder as
+                ``return_intermediate``.
+            backbone_num_channels (int): Input channels of the 1x1
+                ``input_proj`` convolution.
+            hidden_dim (int): Model width.
+            nhead (int): Number of attention heads.
+            num_encoder_layers (int): Encoder depth.
+            num_decoder_layers (int): Decoder depth.
+            dim_feedforward (int): Hidden width of the feed-forward networks.
+            dropout (float): Dropout probability.
+            activation (str): Activation name forwarded to the layers.
+            pe_temperature (int): ``temperature`` of the position embedding.
+            pe_offset (float): ``offset`` of the position embedding.
+            attn_dropout (float, optional): Forwarded to the layers.
+            act_dropout (float, optional): Forwarded to the layers.
+            normalize_before (bool): Forwarded to the layers; also decides
+                whether the encoder gets a final LayerNorm.
+        """
+        super(DETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'],\
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+
+        encoder_layer = TransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        # The encoder only gets a final LayerNorm in pre-norm mode.
+        encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None
+        self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers,
+                                          encoder_norm)
+
+        decoder_layer = TransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation,
+            attn_dropout, act_dropout, normalize_before)
+        # The decoder always gets a final LayerNorm.
+        decoder_norm = nn.LayerNorm(hidden_dim)
+        self.decoder = TransformerDecoder(
+            decoder_layer,
+            num_decoder_layers,
+            decoder_norm,
+            return_intermediate=return_intermediate_dec)
+
+        # 1x1 convolution from the backbone channels to hidden_dim.
+        self.input_proj = nn.Conv2D(
+            backbone_num_channels, hidden_dim, kernel_size=1)
+        self.query_pos_embed = nn.Embedding(num_queries, hidden_dim)
+        # Normalisation is only switched on for the 'sine' embedding.
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type,
+            offset=pe_offset)
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for p in self.parameters():
+            if p.dim() > 1:
+                xavier_uniform_(p)
+        conv_init_(self.input_proj)
+        normal_(self.query_pos_embed.weight)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'backbone_num_channels': [i.channels for i in input_shape][-1],
+        }
+
+    def _convert_attention_mask(self, mask):
+        return (mask - 1.0) * 1e9
+
+    def forward(self, src, src_mask=None, *args, **kwargs):
+        r"""
+        Applies a Transformer model on the inputs.
+
+        Parameters:
+            src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]].
+            src_mask (Tensor, optional): A tensor used in multi-head attention
+                to prevents attention to some unwanted positions, usually the
+                paddings or the subsequent positions. It is a tensor with shape
+                [bs, H, W]`. When the data type is bool, the unwanted positions
+                have `False` values and the others have `True` values. When the
+                data type is int, the unwanted positions have 0 values and the
+                others have 1 values. When the data type is float, the unwanted
+                positions have `-INF` values and the others have 0 values. It
+                can be None when nothing wanted or needed to be prevented
+                attention to. Default None.
+
+        Returns:
+            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
+            memory (Tensor): [batch_size, hidden_dim, h, w]
+        """
+        # use last level feature map
+        src_proj = self.input_proj(src[-1])
+        bs, c, h, w = paddle.shape(src_proj)
+        # flatten [B, C, H, W] to [B, HxW, C]
+        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
+        if src_mask is not None:
+            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
+        else:
+            src_mask = paddle.ones([bs, h, w])
+        pos_embed = self.position_embedding(src_mask).flatten(1, 2)
+
+        if self.training:
+            src_mask = self._convert_attention_mask(src_mask)
+            src_mask = src_mask.reshape([bs, 1, 1, h * w])
+        else:
+            src_mask = None
+
+        memory = self.encoder(
+            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
+
+        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
+            [bs, 1, 1])
+        tgt = paddle.zeros_like(query_pos_embed)
+        output = self.decoder(
+            tgt,
+            memory,
+            memory_mask=src_mask,
+            pos_embed=pos_embed,
+            query_pos_embed=query_pos_embed)
+
+        if self.training:
+            src_mask = src_mask.reshape([bs, 1, 1, h, w])
+        else:
+            src_mask = None
+
+        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
+                src_proj, src_mask)
--- a/rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/dino_transformer.py
@@ -0,0 +1,527 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Modified from detrex (https://github.com/IDEA-Research/detrex)
+# Copyright 2022 The IDEA Authors. All rights reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .position_encoding import PositionEmbedding
+from .deformable_transformer import (MSDeformableAttention,
+                                     DeformableTransformerEncoderLayer,
+                                     DeformableTransformerEncoder)
+from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
+                           bias_init_with_prob)
+from .utils import (_get_clones, get_valid_ratio,
+                    get_contrastive_denoising_training_group,
+                    get_sine_pos_embed, inverse_sigmoid, MLP)
+
+__all__ = ['DINOTransformer']
+
+
+class DINOTransformerDecoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 lr_mult=1.0,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(DINOTransformerDecoderLayer, self).__init__()
+
+        # self attention
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # cross attention
+        self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels,
+                                                n_points, lr_mult)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.activation = getattr(F, activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(
+            d_model, weight_attr=weight_attr, bias_attr=bias_attr)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):
+        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        # self attention
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        if attn_mask is not None:
+            attn_mask = paddle.where(
+                attn_mask.astype('bool'),
+                paddle.zeros(attn_mask.shape, tgt.dtype),
+                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype))
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # cross attention
+        tgt2 = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
+            memory_spatial_shapes, memory_level_start_index, memory_mask)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+
+        # ffn
+        tgt2 = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+
+        return tgt
+
+
+class DINOTransformerDecoder(nn.Layer):
+    def __init__(self,
+                 hidden_dim,
+                 decoder_layer,
+                 num_layers,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(DINOTransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.norm = nn.LayerNorm(
+            hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)
+
+    def forward(self,
+                tgt,
+                ref_points_unact,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,
+                query_pos_head,
+                valid_ratios=None,
+                attn_mask=None,
+                memory_mask=None):
+        if valid_ratios is None:
+            valid_ratios = paddle.ones(
+                [memory.shape[0], memory_spatial_shapes.shape[0], 2])
+
+        output = tgt
+        intermediate = []
+        inter_bboxes = []
+        ref_points = F.sigmoid(ref_points_unact)
+        for i, layer in enumerate(self.layers):
+            reference_points_input = ref_points.detach().unsqueeze(
+                2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1)
+            query_pos_embed = get_sine_pos_embed(
+                reference_points_input[..., 0, :], self.hidden_dim // 2)
+            query_pos_embed = query_pos_head(query_pos_embed)
+
+            output = layer(output, reference_points_input, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
+                ref_points.detach()))
+
+            intermediate.append(self.norm(output))
+            inter_bboxes.append(ref_points)
+
+        return paddle.stack(intermediate), paddle.stack(inter_bboxes)
+
+
+@register
+class DINOTransformer(nn.Layer):
+    """Multi-scale deformable transformer with encoder query selection,
+    training-time denoising queries and per-layer box refinement."""
+    __shared__ = ['num_classes', 'hidden_dim']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 num_queries=900,
+                 position_embed_type='sine',
+                 in_feats_channel=[512, 1024, 2048],
+                 num_levels=4,
+                 num_encoder_points=4,
+                 num_decoder_points=4,
+                 nhead=8,
+                 num_encoder_layers=6,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 lr_mult=1.0,
+                 pe_temperature=10000,
+                 pe_offset=-0.5,
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=True,
+                 eps=1e-2):
+        """Build the input projections, encoder, decoder, embeddings and
+        prediction heads, then initialize their parameters.
+
+        ``in_feats_channel`` lists the channel count of each backbone level;
+        it may hold fewer entries than ``num_levels``, in which case extra
+        levels are produced by strided convolutions. ``eps`` is the margin
+        used when deciding which generated anchors are valid.
+        """
+        super(DINOTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(in_feats_channel) <= num_levels
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_decoder_layers = num_decoder_layers
+
+        # Parameter attributes with an L2 decay coefficient of 0.0; they are
+        # passed to the normalization layers built below.
+        weight_attr = ParamAttr(regularizer=L2Decay(0.0))
+        bias_attr = ParamAttr(regularizer=L2Decay(0.0))
+        # backbone feature projection
+        self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr)
+
+        # Transformer module
+        encoder_layer = DeformableTransformerEncoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_encoder_points, lr_mult, weight_attr, bias_attr)
+        self.encoder = DeformableTransformerEncoder(encoder_layer,
+                                                    num_encoder_layers)
+        decoder_layer = DINOTransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_decoder_points, lr_mult, weight_attr, bias_attr)
+        self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer,
+                                              num_decoder_layers, weight_attr,
+                                              bias_attr)
+
+        # denoising part
+        # One embedding row per class, initialized from a normal distribution;
+        # its weight is handed to the denoising-group builder in forward().
+        self.denoising_class_embed = nn.Embedding(
+            num_classes,
+            hidden_dim,
+            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+
+        # position embedding
+        self.position_embedding = PositionEmbedding(
+            hidden_dim // 2,
+            temperature=pe_temperature,
+            normalize=True if position_embed_type == 'sine' else False,
+            embed_type=position_embed_type,
+            offset=pe_offset)
+        # One learned vector per feature level, added to the position embedding.
+        self.level_embed = nn.Embedding(num_levels, hidden_dim)
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:
+            # Learned initial query content; otherwise the queries are
+            # gathered from the encoder output in _get_decoder_input().
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(2 * hidden_dim,
+                                  hidden_dim,
+                                  hidden_dim,
+                                  num_layers=2)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(
+                hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr))
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+        # decoder head
+        # One classification head and one box head per decoder layer.
+        self.dec_score_head = nn.LayerList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_decoder_layers)
+        ])
+        self.dec_bbox_head = nn.LayerList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_decoder_layers)
+        ])
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        """Initialize the prediction heads, embeddings and input projections.
+
+        Calls to ``constant_`` without a value rely on that helper's default
+        (presumably 0 -- confirm in the initializer module).
+        """
+        # class and bbox head init
+        bias_cls = bias_init_with_prob(0.01)
+        linear_init_(self.enc_score_head)
+        constant_(self.enc_score_head.bias, bias_cls)
+        constant_(self.enc_bbox_head.layers[-1].weight)
+        constant_(self.enc_bbox_head.layers[-1].bias)
+        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
+            linear_init_(cls_)
+            constant_(cls_.bias, bias_cls)
+            constant_(reg_.layers[-1].weight)
+            constant_(reg_.layers[-1].bias)
+
+        linear_init_(self.enc_output[0])
+        xavier_uniform_(self.enc_output[0].weight)
+        normal_(self.level_embed.weight)
+        if self.learnt_init_query:
+            xavier_uniform_(self.tgt_embed.weight)
+        xavier_uniform_(self.query_pos_head.layers[0].weight)
+        xavier_uniform_(self.query_pos_head.layers[1].weight)
+        # l[0] is the conv of each (conv, norm) projection block.
+        for l in self.input_proj:
+            xavier_uniform_(l[0].weight)
+            constant_(l[0].bias)
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        """Derive ``in_feats_channel`` from the ``channels`` attribute of
+        every entry in ``input_shape``."""
+        return {'in_feats_channel': [i.channels for i in input_shape], }
+
+    def _build_input_proj_layer(self,
+                                in_feats_channel,
+                                weight_attr=None,
+                                bias_attr=None):
+        """Create ``self.input_proj``: one (conv, GroupNorm) block per level.
+
+        Each given backbone level gets a 1x1 conv. Any remaining levels up to
+        ``self.num_levels`` get a 3x3 stride-2 conv; the first of these reads
+        ``in_feats_channel[-1]`` channels, later ones read ``hidden_dim``.
+        """
+        self.input_proj = nn.LayerList()
+        for in_channels in in_feats_channel:
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels, self.hidden_dim, kernel_size=1)), (
+                            'norm', nn.GroupNorm(
+                                32,
+                                self.hidden_dim,
+                                weight_attr=weight_attr,
+                                bias_attr=bias_attr))))
+        in_channels = in_feats_channel[-1]
+        for _ in range(self.num_levels - len(in_feats_channel)):
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1)), ('norm', nn.GroupNorm(
+                            32,
+                            self.hidden_dim,
+                            weight_attr=weight_attr,
+                            bias_attr=bias_attr))))
+            in_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats, pad_mask=None):
+        """Project the feature maps and flatten them into encoder inputs.
+
+        Returns:
+            A tuple ``(feat_flatten, spatial_shapes, level_start_index,
+            mask_flatten, lvl_pos_embed_flatten, valid_ratios)``.
+            ``mask_flatten`` is None when ``pad_mask`` is None.
+        """
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                # The first extra level is computed from the last raw backbone
+                # feature; later ones from the previous projected level.
+                if i == len_srcs:
+                    proj_feats.append(self.input_proj[i](feats[-1]))
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))
+
+        # get encoder inputs
+        feat_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        valid_ratios = []
+        for i, feat in enumerate(proj_feats):
+            bs, _, h, w = paddle.shape(feat)
+            spatial_shapes.append(paddle.stack([h, w]))
+            # [b,c,h,w] -> [b,h*w,c]
+            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
+            if pad_mask is not None:
+                # Resize the padding mask to this level's resolution.
+                mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0]
+            else:
+                # No padding information: use an all-ones mask.
+                mask = paddle.ones([bs, h, w])
+            valid_ratios.append(get_valid_ratio(mask))
+            # [b, h*w, c]
+            pos_embed = self.position_embedding(mask).flatten(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed.weight[i]
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            if pad_mask is not None:
+                # [b, h*w]
+                mask_flatten.append(mask.flatten(1))
+
+        # [b, l, c]
+        feat_flatten = paddle.concat(feat_flatten, 1)
+        # [b, l]
+        mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten,
+                                                                   1)
+        # [b, l, c]
+        lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1)
+        # [num_levels, 2]
+        spatial_shapes = paddle.to_tensor(
+            paddle.stack(spatial_shapes).astype('int64'))
+        # [l] start index of each level
+        level_start_index = paddle.concat([
+            paddle.zeros(
+                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
+        ])
+        # [b, num_levels, 2]
+        valid_ratios = paddle.stack(valid_ratios, 1)
+        return (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
+                lvl_pos_embed_flatten, valid_ratios)
+
+    def forward(self, feats, pad_mask=None, gt_meta=None):
+        """Run encoder, query selection and decoder on the backbone features.
+
+        ``gt_meta`` is only read in training mode, where it is passed to the
+        denoising-group builder.
+
+        Returns:
+            A tuple ``(out_bboxes, out_logits, enc_topk_bboxes,
+            enc_topk_logits, dn_meta)``. ``out_bboxes`` and ``out_logits``
+            are stacked over decoder layers; ``dn_meta`` is None outside
+            training mode.
+        """
+        # input projection and embedding
+        (feat_flatten, spatial_shapes, level_start_index, mask_flatten,
+         lvl_pos_embed_flatten,
+         valid_ratios) = self._get_encoder_input(feats, pad_mask)
+
+        # encoder
+        memory = self.encoder(feat_flatten, spatial_shapes, level_start_index,
+                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
+
+        # prepare denoising training
+        if self.training:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(gt_meta,
+                                            self.num_classes,
+                                            self.num_queries,
+                                            self.denoising_class_embed.weight,
+                                            self.num_denoising,
+                                            self.label_noise_ratio,
+                                            self.box_noise_scale)
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(
+            memory, spatial_shapes, mask_flatten, denoising_class,
+            denoising_bbox_unact)
+
+        # decoder
+        inter_feats, inter_bboxes = self.decoder(
+            target, init_ref_points_unact, memory, spatial_shapes,
+            level_start_index, self.dec_bbox_head, self.query_pos_head,
+            valid_ratios, attn_mask, mask_flatten)
+        out_bboxes = []
+        out_logits = []
+        # Per-layer predictions: each box is the layer's predicted delta added
+        # in logit space to the reference it started from (the initial
+        # reference for layer 0, the previous layer's refined box otherwise).
+        for i in range(self.num_decoder_layers):
+            out_logits.append(self.dec_score_head[i](inter_feats[i]))
+            if i == 0:
+                out_bboxes.append(
+                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
+                              init_ref_points_unact))
+            else:
+                out_bboxes.append(
+                    F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) +
+                              inverse_sigmoid(inter_bboxes[i - 1])))
+        out_bboxes = paddle.stack(out_bboxes)
+        out_logits = paddle.stack(out_logits)
+
+        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
+                dn_meta)
+
+    def _get_encoder_output_anchors(self,
+                                    memory,
+                                    spatial_shapes,
+                                    memory_mask=None,
+                                    grid_size=0.05):
+        """Generate one anchor per memory position and mask invalid ones.
+
+        Anchors are (x, y, w, h): grid-cell centres divided by the valid
+        width/height of the level, with size ``grid_size * 2**lvl``. They are
+        returned in logit space, with invalid positions set to +inf.
+
+        Returns:
+            ``(output_memory, output_anchors)`` where ``output_memory`` is
+            ``self.enc_output`` applied to the memory after invalid positions
+            were zeroed.
+        """
+        output_anchors = []
+        idx = 0
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            if memory_mask is not None:
+                # Valid extent is counted along the first column / first row
+                # of this level's mask.
+                mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w])
+                valid_H = paddle.sum(mask_[:, :, 0], 1)
+                valid_W = paddle.sum(mask_[:, 0, :], 1)
+            else:
+                valid_H, valid_W = h, w
+
+            grid_y, grid_x = paddle.meshgrid(
+                paddle.arange(end=h), paddle.arange(end=w))
+            grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype)
+
+            valid_WH = paddle.stack([valid_W, valid_H], -1).reshape(
+                [-1, 1, 1, 2]).astype(grid_xy.dtype)
+            # +0.5 moves from the cell corner to the cell centre.
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
+            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
+            output_anchors.append(
+                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
+            idx += h * w
+
+        output_anchors = paddle.concat(output_anchors, 1)
+        # An anchor is valid only if all four values lie in (eps, 1 - eps).
+        valid_mask = ((output_anchors > self.eps) *
+                      (output_anchors < 1 - self.eps)).all(-1, keepdim=True)
+        # Inverse sigmoid (logit) of the anchors.
+        output_anchors = paddle.log(output_anchors / (1 - output_anchors))
+        if memory_mask is not None:
+            valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0
+        output_anchors = paddle.where(valid_mask, output_anchors,
+                                      paddle.to_tensor(float("inf")))
+
+        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
+        output_memory = self.enc_output(memory)
+        return output_memory, output_anchors
+
+    def _get_decoder_input(self,
+                           memory,
+                           spatial_shapes,
+                           memory_mask=None,
+                           denoising_class=None,
+                           denoising_bbox_unact=None):
+        """Select the top ``num_queries`` encoder positions as decoder input.
+
+        Positions are ranked by their maximum class logit. Denoising queries
+        and boxes, when given, are concatenated in front of the selected
+        ones.
+
+        Returns:
+            ``(target, reference_points_unact, enc_topk_bboxes,
+            enc_topk_logits)``; the reference points are detached.
+        """
+        bs, _, _ = memory.shape
+        # prepare input for decoder
+        output_memory, output_anchors = self._get_encoder_output_anchors(
+            memory, spatial_shapes, memory_mask)
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(
+            output_memory) + output_anchors
+
+        _, topk_ind = paddle.topk(
+            enc_outputs_class.max(-1), self.num_queries, axis=1)
+        # extract region proposal boxes
+        # Pair every top-k index with its batch index for gather_nd.
+        batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
+        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
+                                                  topk_ind)  # unsigmoided.
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = paddle.concat(
+                [denoising_bbox_unact, reference_points_unact], 1)
+        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
+
+        # extract region features
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = paddle.gather_nd(output_memory, topk_ind).detach()
+        if denoising_class is not None:
+            target = paddle.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact.detach(
+        ), enc_topk_bboxes, enc_topk_logits
--- a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/README.md
@@ -0,0 +1,85 @@
+# Multi-scale deformable attention 自定义OP编译
+该自定义OP参考 Paddle 官方文档[自定义外部算子](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html)实现。
+
+## 1. 环境依赖
+- Paddle >= 2.3.2
+- gcc 8.2
+
+## 2. 安装
+请在当前路径下进行编译安装
+```
+cd rtdetr_paddle/ppdet/modeling/transformers/ext_op/
+python setup_ms_deformable_attn_op.py install
+```
+
+编译完成后即可使用，以下为`ms_deformable_attn`的使用示例
+```
+# 引入paddle和自定义op
+import paddle
+from deformable_detr_ops import ms_deformable_attn
+
+# 构造fake input tensor
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+    return [value, sampling_locations, attention_weights]
+
+value, sampling_locations, attention_weights = get_test_tensors(c)
+
+output = ms_deformable_attn(value,
+                            spatial_shapes,
+                            level_start_index,
+                            sampling_locations,
+                            attention_weights)
+```
+
+## 3. 单元测试
+可以通过执行单元测试来确认自定义算子功能的正确性，执行单元测试的示例如下所示：
+```
+python test_ms_deformable_attn_op.py
+```
+运行成功后，打印如下：
+```
+*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
+*tensor1 True check_gradient_numerical(D=30)
+*tensor2 True check_gradient_numerical(D=30)
+*tensor3 True check_gradient_numerical(D=30)
+*tensor1 True check_gradient_numerical(D=32)
+*tensor2 True check_gradient_numerical(D=32)
+*tensor3 True check_gradient_numerical(D=32)
+*tensor1 True check_gradient_numerical(D=64)
+*tensor2 True check_gradient_numerical(D=64)
+*tensor3 True check_gradient_numerical(D=64)
+*tensor1 True check_gradient_numerical(D=71)
+*tensor2 True check_gradient_numerical(D=71)
+*tensor3 True check_gradient_numerical(D=71)
+*tensor1 True check_gradient_numerical(D=128)
+*tensor2 True check_gradient_numerical(D=128)
+*tensor3 True check_gradient_numerical(D=128)
+*tensor1 True check_gradient_numerical(D=1024)
+*tensor2 True check_gradient_numerical(D=1024)
+*tensor3 True check_gradient_numerical(D=1024)
+*tensor1 True check_gradient_numerical(D=1025)
+*tensor2 True check_gradient_numerical(D=1025)
+*tensor3 True check_gradient_numerical(D=1025)
+*tensor1 True check_gradient_numerical(D=2048)
+*tensor2 True check_gradient_numerical(D=2048)
+*tensor3 True check_gradient_numerical(D=2048)
+*tensor1 True check_gradient_numerical(D=3096)
+*tensor2 True check_gradient_numerical(D=3096)
+*tensor3 True check_gradient_numerical(D=3096)
+```
--- a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/extension.h"
+
+#include <vector>
+
+// Forward declarations of the GPU kernels that are registered with the op
+// below. This file only declares them; the extension is built together with
+// ms_deformable_attn_op.cu, which is presumably where they are defined.
+//
+// Forward kernel: receives the five op inputs and returns the op outputs as a
+// vector of tensors.
+std::vector<paddle::Tensor>
+MSDeformableAttnCUDAForward(const paddle::Tensor &value,
+                            const paddle::Tensor &value_spatial_shapes,
+                            const paddle::Tensor &value_level_start_index,
+                            const paddle::Tensor &sampling_locations,
+                            const paddle::Tensor &attention_weights);
+
+// Backward kernel: receives the same five inputs plus the gradient of the
+// forward output and returns the gradient tensors.
+std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
+    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
+    const paddle::Tensor &value_level_start_index,
+    const paddle::Tensor &sampling_locations,
+    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
+
+//// CPU not implemented
+
+// Shape inference for ms_deformable_attn.
+//
+// Produces a single output shape
+//   {value_shape[0], sampling_locations_shape[1],
+//    value_shape[2] * value_shape[3]}.
+// In the accompanying Python test the inputs are laid out as
+// value [bs, value_length, n_heads, channels] and
+// sampling_locations [bs, query_length, ...], which makes the output
+// [bs, query_length, n_heads * channels]. The remaining shape arguments are
+// accepted to match the op's input list but are not read.
+std::vector<std::vector<int64_t>>
+MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
+                           std::vector<int64_t> value_spatial_shapes_shape,
+                           std::vector<int64_t> value_level_start_index_shape,
+                           std::vector<int64_t> sampling_locations_shape,
+                           std::vector<int64_t> attention_weights_shape) {
+  const int64_t batch_size = value_shape[0];
+  const int64_t num_queries = sampling_locations_shape[1];
+  const int64_t out_channels = value_shape[2] * value_shape[3];
+
+  std::vector<int64_t> out_shape{batch_size, num_queries, out_channels};
+  return {out_shape};
+}
+
+// Dtype inference for ms_deformable_attn: the single output takes the dtype
+// of `value`; the dtypes of the other inputs are not consulted.
+std::vector<paddle::DataType>
+MSDeformableAttnInferDtype(paddle::DataType value_dtype,
+                           paddle::DataType value_spatial_shapes_dtype,
+                           paddle::DataType value_level_start_index_dtype,
+                           paddle::DataType sampling_locations_dtype,
+                           paddle::DataType attention_weights_dtype) {
+  std::vector<paddle::DataType> out_dtypes;
+  out_dtypes.push_back(value_dtype);
+  return out_dtypes;
+}
+
+// Register the forward op `ms_deformable_attn`: five named inputs, a single
+// output "Out", the CUDA forward kernel, and the shape/dtype inference
+// functions defined in this file.
+PD_BUILD_OP(ms_deformable_attn)
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
+             "AttentionWeights"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
+    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
+    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
+
+// Register the matching grad op: it receives the forward inputs plus the
+// gradient of "Out", and declares one gradient output per forward input
+// (including the two index-like inputs "SpatialShapes" and "LevelIndex").
+PD_BUILD_GRAD_OP(ms_deformable_attn)
+    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
+             "AttentionWeights", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
+              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
+              paddle::Grad("AttentionWeights")})
+    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
--- a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
--- a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
@@ -0,0 +1,7 @@
+from paddle.utils.cpp_extension import CUDAExtension, setup
+
+if __name__ == "__main__":
+    setup(
+        name='deformable_detr_ops',
+        ext_modules=CUDAExtension(
+            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
--- a/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import division
+
+import os
+import sys
+import random
+import numpy as np
+import paddle
+# add python path of PaddleDetection to sys.path
+parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5)))
+if parent_path not in sys.path:
+    sys.path.append(parent_path)
+
+from ppdet.modeling.transformers.utils import deformable_attention_core_func
+ms_deform_attn_core_paddle = deformable_attention_core_func
+
+try:
+    gpu_index = int(sys.argv[1])
+except:
+    gpu_index = 0
+print(f'Use gpu {gpu_index} to test...')
+paddle.set_device(f'gpu:{gpu_index}')
+
+try:
+    from deformable_detr_ops import ms_deformable_attn
+except Exception as e:
+    print('import deformable_detr_ops error', e)
+    sys.exit(-1)
+
+paddle.seed(1)
+random.seed(1)
+np.random.seed(1)
+
+bs, n_heads, c = 2, 8, 8
+query_length, n_levels, n_points = 2, 2, 2
+spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
+level_start_index = paddle.concat((paddle.to_tensor(
+    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
+value_length = sum([(H * W).item() for H, W in spatial_shapes])
+
+
+def get_test_tensors(channels):
+    value = paddle.rand(
+        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
+    sampling_locations = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points, 2],
+        dtype=paddle.float32)
+    attention_weights = paddle.rand(
+        [bs, query_length, n_heads, n_levels, n_points],
+        dtype=paddle.float32) + 1e-5
+    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
+        -2, keepdim=True)
+
+    return [value, sampling_locations, attention_weights]
+
+
+@paddle.no_grad()
+def check_forward_equal_with_paddle_float():
+    value, sampling_locations, attention_weights = get_test_tensors(c)
+
+    output_paddle = ms_deform_attn_core_paddle(
+        value, spatial_shapes, level_start_index, sampling_locations,
+        attention_weights).detach().cpu()
+    output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index,
+                                     sampling_locations,
+                                     attention_weights).detach().cpu()
+    fwdok = paddle.allclose(
+        output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item()
+    max_abs_err = (output_cuda - output_paddle).abs().max().item()
+    max_rel_err = (
+        (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item()
+
+    print(
+        f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}'
+    )
+
+
+def check_gradient_numerical(channels=4):
+    value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors(
+        channels)
+    value_paddle.stop_gradient = False
+    sampling_locations_paddle.stop_gradient = False
+    attention_weights_paddle.stop_gradient = False
+
+    value_cuda = value_paddle.detach().clone()
+    sampling_locations_cuda = sampling_locations_paddle.detach().clone()
+    attention_weights_cuda = attention_weights_paddle.detach().clone()
+    value_cuda.stop_gradient = False
+    sampling_locations_cuda.stop_gradient = False
+    attention_weights_cuda.stop_gradient = False
+
+    output_paddle = ms_deform_attn_core_paddle(
+        value_paddle, spatial_shapes, level_start_index,
+        sampling_locations_paddle, attention_weights_paddle)
+    output_paddle.sum().backward()
+
+    output_cuda = ms_deformable_attn(value_cuda, spatial_shapes,
+                                     level_start_index, sampling_locations_cuda,
+                                     attention_weights_cuda)
+    output_cuda.sum().backward()
+
+    res = paddle.allclose(
+        value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item()
+    print(f'*tensor1 {res} check_gradient_numerical(D={channels})')
+
+    res = paddle.allclose(
+        sampling_locations_paddle.grad,
+        sampling_locations_cuda.grad,
+        rtol=1e-2,
+        atol=1e-3).item()
+    print(f'*tensor2 {res} check_gradient_numerical(D={channels})')
+
+    res = paddle.allclose(
+        attention_weights_paddle.grad,
+        attention_weights_cuda.grad,
+        rtol=1e-2,
+        atol=1e-3).item()
+    print(f'*tensor3 {res} check_gradient_numerical(D={channels})')
+
+
+if __name__ == '__main__':
+    check_forward_equal_with_paddle_float()
+
+    for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]:
+        check_gradient_numerical(channels)
--- a/rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/hybrid_encoder.py
@@ -0,0 +1,287 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from ppdet.core.workspace import register, serializable
+from ppdet.modeling.ops import get_act_fn
+from ..shape_spec import ShapeSpec
+from ..backbones.csp_darknet import BaseConv
+from ..backbones.cspresnet import RepVggBlock
+from ppdet.modeling.transformers.detr_transformer import TransformerEncoder
+from ..initializer import xavier_uniform_, linear_init_
+from ..layers import MultiHeadAttention
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+__all__ = ['HybridEncoder']
+
+
+class CSPRepLayer(nn.Layer):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 num_blocks=3,
+                 expansion=1.0,
+                 bias=False,
+                 act="silu"):
+        super(CSPRepLayer, self).__init__()
+        hidden_channels = int(out_channels * expansion)
+        self.conv1 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.conv2 = BaseConv(
+            in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act)
+        self.bottlenecks = nn.Sequential(*[
+            RepVggBlock(
+                hidden_channels, hidden_channels, act=act)
+            for _ in range(num_blocks)
+        ])
+        if hidden_channels != out_channels:
+            self.conv3 = BaseConv(
+                hidden_channels,
+                out_channels,
+                ksize=1,
+                stride=1,
+                bias=bias,
+                act=act)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, x):
+        x_1 = self.conv1(x)
+        x_1 = self.bottlenecks(x_1)
+        x_2 = self.conv2(x)
+        return self.conv3(x_1 + x_2)
+
+
+@register
+class TransformerLayer(nn.Layer):
+    def __init__(self,
+                 d_model,
+                 nhead,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 attn_dropout=None,
+                 act_dropout=None,
+                 normalize_before=False):
+        super(TransformerLayer, self).__init__()
+        attn_dropout = dropout if attn_dropout is None else attn_dropout
+        act_dropout = dropout if act_dropout is None else act_dropout
+        self.normalize_before = normalize_before
+
+        self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train")
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train")
+        self.activation = getattr(F, activation)
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+
+    @staticmethod
+    def with_pos_embed(tensor, pos_embed):
+        return tensor if pos_embed is None else tensor + pos_embed
+
+    def forward(self, src, src_mask=None, pos_embed=None):
+        residual = src
+        if self.normalize_before:
+            src = self.norm1(src)
+        q = k = self.with_pos_embed(src, pos_embed)
+        src = self.self_attn(q, k, value=src, attn_mask=src_mask)
+
+        src = residual + self.dropout1(src)
+        if not self.normalize_before:
+            src = self.norm1(src)
+
+        residual = src
+        if self.normalize_before:
+            src = self.norm2(src)
+        src = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = residual + self.dropout2(src)
+        if not self.normalize_before:
+            src = self.norm2(src)
+        return src
+
+
+@register
+@serializable
+class HybridEncoder(nn.Layer):
+    __shared__ = ['depth_mult', 'act', 'trt', 'eval_size']
+    __inject__ = ['encoder_layer']
+
+    def __init__(self,
+                 in_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 hidden_dim=256,
+                 use_encoder_idx=[2],
+                 num_encoder_layers=1,
+                 encoder_layer='TransformerLayer',
+                 pe_temperature=10000,
+                 expansion=1.0,
+                 depth_mult=1.0,
+                 act='silu',
+                 trt=False,
+                 eval_size=None):
+        super(HybridEncoder, self).__init__()
+        self.in_channels = in_channels
+        self.feat_strides = feat_strides
+        self.hidden_dim = hidden_dim
+        self.use_encoder_idx = use_encoder_idx
+        self.num_encoder_layers = num_encoder_layers
+        self.pe_temperature = pe_temperature
+        self.eval_size = eval_size
+
+        # channel projection
+        self.input_proj = nn.LayerList()
+        for in_channel in in_channels:
+            self.input_proj.append(
+                nn.Sequential(
+                    nn.Conv2D(
+                        in_channel, hidden_dim, kernel_size=1, bias_attr=False),
+                    nn.BatchNorm2D(
+                        hidden_dim,
+                        weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                        bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))
+        # encoder transformer
+        self.encoder = nn.LayerList([
+            TransformerEncoder(encoder_layer, num_encoder_layers)
+            for _ in range(len(use_encoder_idx))
+        ])
+
+        act = get_act_fn(
+            act, trt=trt) if act is None or isinstance(act,
+                                                       (str, dict)) else act
+        # top-down fpn
+        self.lateral_convs = nn.LayerList()
+        self.fpn_blocks = nn.LayerList()
+        for idx in range(len(in_channels) - 1, 0, -1):
+            self.lateral_convs.append(
+                BaseConv(
+                    hidden_dim, hidden_dim, 1, 1, act=act))
+            self.fpn_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion))
+
+        # bottom-up pan
+        self.downsample_convs = nn.LayerList()
+        self.pan_blocks = nn.LayerList()
+        for idx in range(len(in_channels) - 1):
+            self.downsample_convs.append(
+                BaseConv(
+                    hidden_dim, hidden_dim, 3, stride=2, act=act))
+            self.pan_blocks.append(
+                CSPRepLayer(
+                    hidden_dim * 2,
+                    hidden_dim,
+                    round(3 * depth_mult),
+                    act=act,
+                    expansion=expansion))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        if self.eval_size:
+            for idx in self.use_encoder_idx:
+                stride = self.feat_strides[idx]
+                pos_embed = self.build_2d_sincos_position_embedding(
+                    self.eval_size[1] // stride, self.eval_size[0] // stride,
+                    self.hidden_dim, self.pe_temperature)
+                setattr(self, f'pos_embed{idx}', pos_embed)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(w,
+                                           h,
+                                           embed_dim=256,
+                                           temperature=10000.):
+        grid_w = paddle.arange(int(w), dtype=paddle.float32)
+        grid_h = paddle.arange(int(h), dtype=paddle.float32)
+        grid_w, grid_h = paddle.meshgrid(grid_w, grid_h)
+        assert embed_dim % 4 == 0, \
+            'Embed dimension must be divisible by 4 for 2D sin-cos position embedding'
+        pos_dim = embed_dim // 4
+        omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim
+        omega = 1. / (temperature**omega)
+
+        out_w = grid_w.flatten()[..., None] @omega[None]
+        out_h = grid_h.flatten()[..., None] @omega[None]
+
+        return paddle.concat(
+            [
+                paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h),
+                paddle.cos(out_h)
+            ],
+            axis=1)[None, :, :]
+
+    def forward(self, feats, for_mot=False):
+        assert len(feats) == len(self.in_channels)
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        # encoder
+        if self.num_encoder_layers > 0:
+            for i, enc_ind in enumerate(self.use_encoder_idx):
+                h, w = proj_feats[enc_ind].shape[2:]
+                # flatten [B, C, H, W] to [B, HxW, C]
+                src_flatten = proj_feats[enc_ind].flatten(2).transpose(
+                    [0, 2, 1])
+                if self.training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        w, h, self.hidden_dim, self.pe_temperature)
+                else:
+                    pos_embed = getattr(self, f'pos_embed{enc_ind}', None)
+                memory = self.encoder[i](src_flatten, pos_embed=pos_embed)
+                proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape(
+                    [-1, self.hidden_dim, h, w])
+
+        # top-down fpn
+        inner_outs = [proj_feats[-1]]
+        for idx in range(len(self.in_channels) - 1, 0, -1):
+            feat_heigh = inner_outs[0]
+            feat_low = proj_feats[idx - 1]
+            feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx](
+                feat_heigh)
+            inner_outs[0] = feat_heigh
+
+            upsample_feat = F.interpolate(
+                feat_heigh, scale_factor=2., mode="nearest")
+            inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx](
+                paddle.concat(
+                    [upsample_feat, feat_low], axis=1))
+            inner_outs.insert(0, inner_out)
+
+        # bottom-up pan
+        outs = [inner_outs[0]]
+        for idx in range(len(self.in_channels) - 1):
+            feat_low = outs[-1]
+            feat_height = inner_outs[idx + 1]
+            downsample_feat = self.downsample_convs[idx](feat_low)
+            out = self.pan_blocks[idx](paddle.concat(
+                [downsample_feat, feat_height], axis=1))
+            outs.append(out)
+
+        return outs
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {
+            'in_channels': [i.channels for i in input_shape],
+            'feat_strides': [i.stride for i in input_shape]
+        }
+
+    @property
+    def out_shape(self):
+        return [
+            ShapeSpec(
+                channels=self.hidden_dim, stride=self.feat_strides[idx])
+            for idx in range(len(self.in_channels))
+        ]
--- a/rtdetr_paddle/ppdet/modeling/transformers/matchers.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/matchers.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from scipy.optimize import linear_sum_assignment
+
+from ppdet.core.workspace import register, serializable
+from ..losses.iou_loss import GIoULoss
+from .utils import bbox_cxcywh_to_xyxy
+
+__all__ = ['HungarianMatcher']
+
+
+@register
+@serializable
+class HungarianMatcher(nn.Layer):
+    __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points']
+
+    def __init__(self,
+                 matcher_coeff={
+                     'class': 1,
+                     'bbox': 5,
+                     'giou': 2,
+                     'mask': 1,
+                     'dice': 1
+                 },
+                 use_focal_loss=False,
+                 with_mask=False,
+                 num_sample_points=12544,
+                 alpha=0.25,
+                 gamma=2.0):
+        r"""
+        Args:
+            matcher_coeff (dict): The coefficient of hungarian matcher cost.
+        """
+        super(HungarianMatcher, self).__init__()
+        self.matcher_coeff = matcher_coeff
+        self.use_focal_loss = use_focal_loss
+        self.with_mask = with_mask
+        self.num_sample_points = num_sample_points
+        self.alpha = alpha
+        self.gamma = gamma
+
+        self.giou_loss = GIoULoss()
+
+    def forward(self,
+                boxes,
+                logits,
+                gt_bbox,
+                gt_class,
+                masks=None,
+                gt_mask=None):
+        r"""
+        Args:
+            boxes (Tensor): [b, query, 4]
+            logits (Tensor): [b, query, num_classes]
+            gt_bbox (List(Tensor)): list[[n, 4]]
+            gt_class (List(Tensor)): list[[n, 1]]
+            masks (Tensor|None): [b, query, h, w]
+            gt_mask (List(Tensor)): list[[n, H, W]]
+
+        Returns:
+            A list of size batch_size, containing tuples of (index_i, index_j) where:
+                - index_i is the indices of the selected predictions (in order)
+                - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds:
+                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        bs, num_queries = boxes.shape[:2]
+
+        num_gts = [len(a) for a in gt_class]
+        if sum(num_gts) == 0:
+            return [(paddle.to_tensor(
+                [], dtype=paddle.int64), paddle.to_tensor(
+                    [], dtype=paddle.int64)) for _ in range(bs)]
+
+        # We flatten to compute the cost matrices in a batch
+        # [batch_size * num_queries, num_classes]
+        logits = logits.detach()
+        out_prob = F.sigmoid(logits.flatten(
+            0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1))
+        # [batch_size * num_queries, 4]
+        out_bbox = boxes.detach().flatten(0, 1)
+
+        # Also concat the target labels and boxes
+        tgt_ids = paddle.concat(gt_class).flatten()
+        tgt_bbox = paddle.concat(gt_bbox)
+
+        # Compute the classification cost
+        out_prob = paddle.gather(out_prob, tgt_ids, axis=1)
+        if self.use_focal_loss:
+            neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-(
+                1 - out_prob + 1e-8).log())
+            pos_cost_class = self.alpha * (
+                (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log())
+            cost_class = pos_cost_class - neg_cost_class
+        else:
+            cost_class = -out_prob
+
+        # Compute the L1 cost between boxes
+        cost_bbox = (
+            out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1)
+
+        # Compute the giou cost betwen boxes
+        cost_giou = self.giou_loss(
+            bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)),
+            bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1)
+
+        # Final cost matrix
+        C = self.matcher_coeff['class'] * cost_class + \
+            self.matcher_coeff['bbox'] * cost_bbox + \
+            self.matcher_coeff['giou'] * cost_giou
+        # Compute the mask cost and dice cost
+        if self.with_mask:
+            assert (masks is not None and gt_mask is not None,
+                    'Make sure the input has `mask` and `gt_mask`')
+            # all masks share the same set of points for efficient matching
+            sample_points = paddle.rand([bs, 1, self.num_sample_points, 2])
+            sample_points = 2.0 * sample_points - 1.0
+
+            out_mask = F.grid_sample(
+                masks.detach(), sample_points, align_corners=False).squeeze(-2)
+            out_mask = out_mask.flatten(0, 1)
+
+            tgt_mask = paddle.concat(gt_mask).unsqueeze(1)
+            sample_points = paddle.concat([
+                a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts)
+                if b > 0
+            ])
+            tgt_mask = F.grid_sample(
+                tgt_mask, sample_points, align_corners=False).squeeze([1, 2])
+
+            with paddle.amp.auto_cast(enable=False):
+                # binary cross entropy cost
+                pos_cost_mask = F.binary_cross_entropy_with_logits(
+                    out_mask, paddle.ones_like(out_mask), reduction='none')
+                neg_cost_mask = F.binary_cross_entropy_with_logits(
+                    out_mask, paddle.zeros_like(out_mask), reduction='none')
+                cost_mask = paddle.matmul(
+                    pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul(
+                        neg_cost_mask, 1 - tgt_mask, transpose_y=True)
+                cost_mask /= self.num_sample_points
+
+                # dice cost
+                out_mask = F.sigmoid(out_mask)
+                numerator = 2 * paddle.matmul(
+                    out_mask, tgt_mask, transpose_y=True)
+                denominator = out_mask.sum(
+                    -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0)
+                cost_dice = 1 - (numerator + 1) / (denominator + 1)
+
+                C = C + self.matcher_coeff['mask'] * cost_mask + \
+                    self.matcher_coeff['dice'] * cost_dice
+
+        C = C.reshape([bs, num_queries, -1])
+        C = [a.squeeze(0) for a in C.chunk(bs)]
+        sizes = [a.shape[0] for a in gt_bbox]
+        indices = [
+            linear_sum_assignment(c.split(sizes, -1)[i].numpy())
+            for i, c in enumerate(C)
+        ]
+        return [(paddle.to_tensor(
+            i, dtype=paddle.int64), paddle.to_tensor(
+                j, dtype=paddle.int64)) for i, j in indices]
--- a/rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/position_encoding.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import paddle
+import paddle.nn as nn
+
+from ppdet.core.workspace import register, serializable
+
+
+@register
+@serializable
+class PositionEmbedding(nn.Layer):
+    def __init__(self,
+                 num_pos_feats=128,
+                 temperature=10000,
+                 normalize=True,
+                 scale=2 * math.pi,
+                 embed_type='sine',
+                 num_embeddings=50,
+                 offset=0.,
+                 eps=1e-6):
+        super(PositionEmbedding, self).__init__()
+        assert embed_type in ['sine', 'learned']
+
+        self.embed_type = embed_type
+        self.offset = offset
+        self.eps = eps
+        if self.embed_type == 'sine':
+            self.num_pos_feats = num_pos_feats
+            self.temperature = temperature
+            self.normalize = normalize
+            self.scale = scale
+        elif self.embed_type == 'learned':
+            self.row_embed = nn.Embedding(num_embeddings, num_pos_feats)
+            self.col_embed = nn.Embedding(num_embeddings, num_pos_feats)
+        else:
+            raise ValueError(f"{self.embed_type} is not supported.")
+
+    def forward(self, mask):
+        """
+        Args:
+            mask (Tensor): [B, H, W]
+        Returns:
+            pos (Tensor): [B, H, W, C]
+        """
+        if self.embed_type == 'sine':
+            y_embed = mask.cumsum(1)
+            x_embed = mask.cumsum(2)
+            if self.normalize:
+                y_embed = (y_embed + self.offset) / (
+                    y_embed[:, -1:, :] + self.eps) * self.scale
+                x_embed = (x_embed + self.offset) / (
+                    x_embed[:, :, -1:] + self.eps) * self.scale
+
+            dim_t = 2 * (paddle.arange(self.num_pos_feats) //
+                         2).astype('float32')
+            dim_t = self.temperature**(dim_t / self.num_pos_feats)
+
+            pos_x = x_embed.unsqueeze(-1) / dim_t
+            pos_y = y_embed.unsqueeze(-1) / dim_t
+            pos_x = paddle.stack(
+                (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
+                axis=4).flatten(3)
+            pos_y = paddle.stack(
+                (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
+                axis=4).flatten(3)
+            return paddle.concat((pos_y, pos_x), axis=3)
+        elif self.embed_type == 'learned':
+            h, w = mask.shape[-2:]
+            i = paddle.arange(w)
+            j = paddle.arange(h)
+            x_emb = self.col_embed(i)
+            y_emb = self.row_embed(j)
+            return paddle.concat(
+                [
+                    x_emb.unsqueeze(0).tile([h, 1, 1]),
+                    y_emb.unsqueeze(1).tile([1, w, 1]),
+                ],
+                axis=-1).unsqueeze(0)
+        else:
+            raise ValueError(f"not supported {self.embed_type}")
--- a/rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/rtdetr_transformer.py
@@ -0,0 +1,523 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import ParamAttr
+from paddle.regularizer import L2Decay
+
+from ppdet.core.workspace import register
+from ..layers import MultiHeadAttention
+from .deformable_transformer import MSDeformableAttention
+from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_,
+                           bias_init_with_prob)
+from .utils import (_get_clones, get_sine_pos_embed,
+                    get_contrastive_denoising_training_group, inverse_sigmoid, MLP)
+
+__all__ = ['RTDETRTransformer']
+
+
+class PPMSDeformableAttention(MSDeformableAttention):
+    def forward(self,
+                query,
+                reference_points,
+                value,
+                value_spatial_shapes,
+                value_level_start_index,
+                value_mask=None):
+        """
+        Args:
+            query (Tensor): [bs, query_length, C]
+            reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0),
+                bottom-right (1, 1), including padding area
+            value (Tensor): [bs, value_length, C]
+            value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
+            value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...]
+            value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements
+
+        Returns:
+            output (Tensor): [bs, Length_{query}, C]
+        """
+        bs, Len_q = query.shape[:2]
+        Len_v = value.shape[1]
+
+        value = self.value_proj(value)
+        if value_mask is not None:
+            value_mask = value_mask.astype(value.dtype).unsqueeze(-1)
+            value *= value_mask
+        value = value.reshape([bs, Len_v, self.num_heads, self.head_dim])
+
+        sampling_offsets = self.sampling_offsets(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2])
+        attention_weights = self.attention_weights(query).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels * self.num_points])
+        attention_weights = F.softmax(attention_weights).reshape(
+            [bs, Len_q, self.num_heads, self.num_levels, self.num_points])
+
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = paddle.to_tensor(value_spatial_shapes)
+            offset_normalizer = offset_normalizer.flip([1]).reshape(
+                [1, 1, 1, self.num_levels, 1, 2])
+            sampling_locations = reference_points.reshape([
+                bs, Len_q, 1, self.num_levels, 1, 2
+            ]) + sampling_offsets / offset_normalizer
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2] + sampling_offsets /
+                self.num_points * reference_points[:, :, None, :, None, 2:] *
+                0.5)
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".
+                format(reference_points.shape[-1]))
+
+        if not isinstance(query, paddle.Tensor):
+            from ppdet.modeling.transformers.utils import deformable_attention_core_func
+            output = deformable_attention_core_func(
+                value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights)
+        else:
+            value_spatial_shapes = paddle.to_tensor(value_spatial_shapes)
+            value_level_start_index = paddle.to_tensor(value_level_start_index)
+            output = self.ms_deformable_attn_core(
+                value, value_spatial_shapes, value_level_start_index,
+                sampling_locations, attention_weights)
+        output = self.output_proj(output)
+
+        return output
+
+
+class TransformerDecoderLayer(nn.Layer):
+    def __init__(self,
+                 d_model=256,
+                 n_head=8,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 n_levels=4,
+                 n_points=4,
+                 weight_attr=None,
+                 bias_attr=None):
+        super(TransformerDecoderLayer, self).__init__()
+
+        # self attention
+        self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout)
+        self.dropout1 = nn.Dropout(dropout)
+        self.norm1 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+
+        # cross attention
+        self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels,
+                                                  n_points, 1.0)
+        self.dropout2 = nn.Dropout(dropout)
+        self.norm2 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+
+        # ffn
+        self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr,
+                                 bias_attr)
+        self.activation = getattr(F, activation)
+        self.dropout3 = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr,
+                                 bias_attr)
+        self.dropout4 = nn.Dropout(dropout)
+        self.norm3 = nn.LayerNorm(
+            d_model,
+            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+            bias_attr=ParamAttr(regularizer=L2Decay(0.0)))
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        linear_init_(self.linear1)
+        linear_init_(self.linear2)
+        xavier_uniform_(self.linear1.weight)
+        xavier_uniform_(self.linear2.weight)
+
+    def with_pos_embed(self, tensor, pos):
+        return tensor if pos is None else tensor + pos
+
+    def forward_ffn(self, tgt):
+        return self.linear2(self.dropout3(self.activation(self.linear1(tgt))))
+
+    def forward(self,
+                tgt,
+                reference_points,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                attn_mask=None,
+                memory_mask=None,
+                query_pos_embed=None):
+        # self attention
+        q = k = self.with_pos_embed(tgt, query_pos_embed)
+        if attn_mask is not None:
+            attn_mask = paddle.where(
+                attn_mask.astype('bool'),
+                paddle.zeros(attn_mask.shape, tgt.dtype),
+                paddle.full(attn_mask.shape, float("-inf"), tgt.dtype))
+        tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask)
+        tgt = tgt + self.dropout1(tgt2)
+        tgt = self.norm1(tgt)
+
+        # cross attention
+        tgt2 = self.cross_attn(
+            self.with_pos_embed(tgt, query_pos_embed), reference_points, memory,
+            memory_spatial_shapes, memory_level_start_index, memory_mask)
+        tgt = tgt + self.dropout2(tgt2)
+        tgt = self.norm2(tgt)
+
+        # ffn
+        tgt2 = self.forward_ffn(tgt)
+        tgt = tgt + self.dropout4(tgt2)
+        tgt = self.norm3(tgt)
+
+        return tgt
+
+
+class TransformerDecoder(nn.Layer):
+    def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1):
+        super(TransformerDecoder, self).__init__()
+        self.layers = _get_clones(decoder_layer, num_layers)
+        self.hidden_dim = hidden_dim
+        self.num_layers = num_layers
+        self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx
+
+    def forward(self,
+                tgt,
+                ref_points_unact,
+                memory,
+                memory_spatial_shapes,
+                memory_level_start_index,
+                bbox_head,
+                score_head,
+                query_pos_head,
+                attn_mask=None,
+                memory_mask=None):
+        output = tgt
+        dec_out_bboxes = []
+        dec_out_logits = []
+        ref_points_detach = F.sigmoid(ref_points_unact)
+        for i, layer in enumerate(self.layers):
+            ref_points_input = ref_points_detach.unsqueeze(2)
+            query_pos_embed = query_pos_head(ref_points_detach)
+
+            output = layer(output, ref_points_input, memory,
+                           memory_spatial_shapes, memory_level_start_index,
+                           attn_mask, memory_mask, query_pos_embed)
+
+            inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
+                ref_points_detach))
+
+            if self.training:
+                dec_out_logits.append(score_head[i](output))
+                if i == 0:
+                    dec_out_bboxes.append(inter_ref_bbox)
+                else:
+                    dec_out_bboxes.append(
+                        F.sigmoid(bbox_head[i](output) + inverse_sigmoid(
+                            ref_points)))
+            elif i == self.eval_idx:
+                dec_out_logits.append(score_head[i](output))
+                dec_out_bboxes.append(inter_ref_bbox)
+                break
+
+            ref_points = inter_ref_bbox
+            ref_points_detach = inter_ref_bbox.detach(
+            ) if self.training else inter_ref_bbox
+
+        return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits)
+
+
+@register
+class RTDETRTransformer(nn.Layer):
+    __shared__ = ['num_classes', 'hidden_dim', 'eval_size']
+
+    def __init__(self,
+                 num_classes=80,
+                 hidden_dim=256,
+                 num_queries=300,
+                 position_embed_type='sine',
+                 backbone_feat_channels=[512, 1024, 2048],
+                 feat_strides=[8, 16, 32],
+                 num_levels=3,
+                 num_decoder_points=4,
+                 nhead=8,
+                 num_decoder_layers=6,
+                 dim_feedforward=1024,
+                 dropout=0.,
+                 activation="relu",
+                 num_denoising=100,
+                 label_noise_ratio=0.5,
+                 box_noise_scale=1.0,
+                 learnt_init_query=True,
+                 eval_size=None,
+                 eval_idx=-1,
+                 eps=1e-2):
+        super(RTDETRTransformer, self).__init__()
+        assert position_embed_type in ['sine', 'learned'], \
+            f'ValueError: position_embed_type not supported {position_embed_type}!'
+        assert len(backbone_feat_channels) <= num_levels
+        assert len(feat_strides) == len(backbone_feat_channels)
+        for _ in range(num_levels - len(feat_strides)):
+            feat_strides.append(feat_strides[-1] * 2)
+
+        self.hidden_dim = hidden_dim
+        self.nhead = nhead
+        self.feat_strides = feat_strides
+        self.num_levels = num_levels
+        self.num_classes = num_classes
+        self.num_queries = num_queries
+        self.eps = eps
+        self.num_decoder_layers = num_decoder_layers
+        self.eval_size = eval_size
+
+        # backbone feature projection
+        self._build_input_proj_layer(backbone_feat_channels)
+
+        # Transformer module
+        decoder_layer = TransformerDecoderLayer(
+            hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels,
+            num_decoder_points)
+        self.decoder = TransformerDecoder(hidden_dim, decoder_layer,
+                                          num_decoder_layers, eval_idx)
+
+        # denoising part
+        self.denoising_class_embed = nn.Embedding(
+            num_classes,
+            hidden_dim,
+            weight_attr=ParamAttr(initializer=nn.initializer.Normal()))
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+
+        # decoder embedding
+        self.learnt_init_query = learnt_init_query
+        if learnt_init_query:
+            self.tgt_embed = nn.Embedding(num_queries, hidden_dim)
+        self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.LayerNorm(
+                hidden_dim,
+                weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                bias_attr=ParamAttr(regularizer=L2Decay(0.0))))
+        self.enc_score_head = nn.Linear(hidden_dim, num_classes)
+        self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+
+        # decoder head
+        self.dec_score_head = nn.LayerList([
+            nn.Linear(hidden_dim, num_classes)
+            for _ in range(num_decoder_layers)
+        ])
+        self.dec_bbox_head = nn.LayerList([
+            MLP(hidden_dim, hidden_dim, 4, num_layers=3)
+            for _ in range(num_decoder_layers)
+        ])
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        # class and bbox head init
+        bias_cls = bias_init_with_prob(0.01)
+        linear_init_(self.enc_score_head)
+        constant_(self.enc_score_head.bias, bias_cls)
+        constant_(self.enc_bbox_head.layers[-1].weight)
+        constant_(self.enc_bbox_head.layers[-1].bias)
+        for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head):
+            linear_init_(cls_)
+            constant_(cls_.bias, bias_cls)
+            constant_(reg_.layers[-1].weight)
+            constant_(reg_.layers[-1].bias)
+
+        linear_init_(self.enc_output[0])
+        xavier_uniform_(self.enc_output[0].weight)
+        if self.learnt_init_query:
+            xavier_uniform_(self.tgt_embed.weight)
+        xavier_uniform_(self.query_pos_head.layers[0].weight)
+        xavier_uniform_(self.query_pos_head.layers[1].weight)
+        for l in self.input_proj:
+            xavier_uniform_(l[0].weight)
+
+        # init encoder output anchors and valid_mask
+        if self.eval_size:
+            self.anchors, self.valid_mask = self._generate_anchors()
+
+    @classmethod
+    def from_config(cls, cfg, input_shape):
+        return {'backbone_feat_channels': [i.channels for i in input_shape]}
+
+    def _build_input_proj_layer(self, backbone_feat_channels):
+        self.input_proj = nn.LayerList()
+        for in_channels in backbone_feat_channels:
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=1,
+                        bias_attr=False)), ('norm', nn.BatchNorm2D(
+                            self.hidden_dim,
+                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
+        in_channels = backbone_feat_channels[-1]
+        for _ in range(self.num_levels - len(backbone_feat_channels)):
+            self.input_proj.append(
+                nn.Sequential(
+                    ('conv', nn.Conv2D(
+                        in_channels,
+                        self.hidden_dim,
+                        kernel_size=3,
+                        stride=2,
+                        padding=1,
+                        bias_attr=False)), ('norm', nn.BatchNorm2D(
+                            self.hidden_dim,
+                            weight_attr=ParamAttr(regularizer=L2Decay(0.0)),
+                            bias_attr=ParamAttr(regularizer=L2Decay(0.0))))))
+            in_channels = self.hidden_dim
+
+    def _get_encoder_input(self, feats):
+        # get projection features
+        proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)]
+        if self.num_levels > len(proj_feats):
+            len_srcs = len(proj_feats)
+            for i in range(len_srcs, self.num_levels):
+                if i == len_srcs:
+                    proj_feats.append(self.input_proj[i](feats[-1]))
+                else:
+                    proj_feats.append(self.input_proj[i](proj_feats[-1]))
+
+        # get encoder inputs
+        feat_flatten = []
+        spatial_shapes = []
+        level_start_index = [0, ]
+        for i, feat in enumerate(proj_feats):
+            _, _, h, w = feat.shape
+            # [b, c, h, w] -> [b, h*w, c]
+            feat_flatten.append(feat.flatten(2).transpose([0, 2, 1]))
+            # [num_levels, 2]
+            spatial_shapes.append([h, w])
+            # [l], start index of each level
+            level_start_index.append(h * w + level_start_index[-1])
+
+        # [b, l, c]
+        feat_flatten = paddle.concat(feat_flatten, 1)
+        level_start_index.pop()
+        return (feat_flatten, spatial_shapes, level_start_index)
+
+    def forward(self, feats, pad_mask=None, gt_meta=None):
+        # input projection and embedding
+        (memory, spatial_shapes,
+         level_start_index) = self._get_encoder_input(feats)
+
+        # prepare denoising training
+        if self.training:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \
+                get_contrastive_denoising_training_group(gt_meta,
+                                            self.num_classes,
+                                            self.num_queries,
+                                            self.denoising_class_embed.weight,
+                                            self.num_denoising,
+                                            self.label_noise_ratio,
+                                            self.box_noise_scale)
+        else:
+            denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None
+
+        target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \
+            self._get_decoder_input(
+            memory, spatial_shapes, denoising_class, denoising_bbox_unact)
+
+        # decoder
+        out_bboxes, out_logits = self.decoder(
+            target,
+            init_ref_points_unact,
+            memory,
+            spatial_shapes,
+            level_start_index,
+            self.dec_bbox_head,
+            self.dec_score_head,
+            self.query_pos_head,
+            attn_mask=attn_mask)
+        return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits,
+                dn_meta)
+
+    def _generate_anchors(self,
+                          spatial_shapes=None,
+                          grid_size=0.05,
+                          dtype="float32"):
+        if spatial_shapes is None:
+            spatial_shapes = [
+                [int(self.eval_size[0] / s), int(self.eval_size[1] / s)]
+                for s in self.feat_strides
+            ]
+        anchors = []
+        for lvl, (h, w) in enumerate(spatial_shapes):
+            grid_y, grid_x = paddle.meshgrid(
+                paddle.arange(
+                    end=h, dtype=dtype),
+                paddle.arange(
+                    end=w, dtype=dtype))
+            grid_xy = paddle.stack([grid_x, grid_y], -1)
+
+            valid_WH = paddle.to_tensor([w, h]).astype(dtype)
+            grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH
+            wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl)
+            anchors.append(
+                paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4]))
+
+        anchors = paddle.concat(anchors, 1)
+        valid_mask = ((anchors > self.eps) *
+                      (anchors < 1 - self.eps)).all(-1, keepdim=True)
+        anchors = paddle.log(anchors / (1 - anchors))
+        anchors = paddle.where(valid_mask, anchors,
+                               paddle.to_tensor(float("inf")))
+        return anchors, valid_mask
+
+    def _get_decoder_input(self,
+                           memory,
+                           spatial_shapes,
+                           denoising_class=None,
+                           denoising_bbox_unact=None):
+        bs, _, _ = memory.shape
+        # prepare input for decoder
+        if self.training or self.eval_size is None:
+            anchors, valid_mask = self._generate_anchors(spatial_shapes)
+        else:
+            anchors, valid_mask = self.anchors, self.valid_mask
+        memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.))
+        output_memory = self.enc_output(memory)
+
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors
+
+        _, topk_ind = paddle.topk(
+            enc_outputs_class.max(-1), self.num_queries, axis=1)
+        # extract region proposal boxes
+        batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
+        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries])
+        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
+
+        reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact,
+                                                  topk_ind)  # unsigmoided.
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = paddle.concat(
+                [denoising_bbox_unact, reference_points_unact], 1)
+        if self.training:
+            reference_points_unact = reference_points_unact.detach()
+        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
+
+        # extract region features
+        if self.learnt_init_query:
+            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
+        else:
+            target = paddle.gather_nd(output_memory, topk_ind)
+            if self.training:
+                target = target.detach()
+        if denoising_class is not None:
+            target = paddle.concat([denoising_class, target], 1)
+
+        return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits
--- a/rtdetr_paddle/ppdet/modeling/transformers/utils.py
+++ b/rtdetr_paddle/ppdet/modeling/transformers/utils.py
@@ -0,0 +1,481 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from DETR (https://github.com/facebookresearch/detr)
+# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+# Modified from detrex (https://github.com/IDEA-Research/detrex)
+# Copyright 2022 The IDEA Authors. All rights reserved.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import math
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+# Names exported by `from .utils import *`. Anything in this module that is
+# not listed here has to be imported explicitly by name.
+__all__ = [
+    '_get_clones', 'bbox_cxcywh_to_xyxy',
+    'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid',
+    'deformable_attention_core_func', 'varifocal_loss_with_logits'
+]
+
+
+
+def bbox_area(boxes):
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def bbox_overlaps(boxes1, boxes2):
+    """
+    Calculate overlaps between boxes1 and boxes2
+
+    Args:
+        boxes1 (Tensor): boxes with shape [M, 4]
+        boxes2 (Tensor): boxes with shape [N, 4]
+
+    Return:
+        overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N]
+    """
+    M = boxes1.shape[0]
+    N = boxes2.shape[0]
+    if M * N == 0:
+        return paddle.zeros([M, N], dtype='float32')
+    area1 = bbox_area(boxes1)
+    area2 = bbox_area(boxes2)
+
+    xy_max = paddle.minimum(
+        paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:])
+    xy_min = paddle.maximum(
+        paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2])
+    width_height = xy_max - xy_min
+    width_height = width_height.clip(min=0)
+    inter = width_height.prod(axis=2)
+
+    overlaps = paddle.where(inter > 0, inter /
+                            (paddle.unsqueeze(area1, 1) + area2 - inter),
+                            paddle.zeros_like(inter))
+    return overlaps
+
+
+def _get_clones(module, N):
+    return nn.LayerList([copy.deepcopy(module) for _ in range(N)])
+
+
+def bbox_cxcywh_to_xyxy(x):
+    cxcy, wh = paddle.split(x, 2, axis=-1)
+    return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)
+
+
+def bbox_xyxy_to_cxcywh(x):
+    x1, y1, x2, y2 = x.split(4, axis=-1)
+    return paddle.concat(
+        [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1)
+
+
+def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0):
+    prob = F.sigmoid(logit)
+    ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none")
+    p_t = prob * label + (1 - prob) * (1 - label)
+    loss = ce_loss * ((1 - p_t)**gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * label + (1 - alpha) * (1 - label)
+        loss = alpha_t * loss
+    return loss.mean(1).sum() / normalizer
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clip(min=0., max=1.)
+    return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps))
+
+
+def deformable_attention_core_func(value, value_spatial_shapes,
+                                   value_level_start_index, sampling_locations,
+                                   attention_weights):
+    """
+    Args:
+        value (Tensor): [bs, value_length, n_head, c]
+        value_spatial_shapes (Tensor|List): [n_levels, 2]
+        value_level_start_index (Tensor|List): [n_levels]
+        sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2]
+        attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points]
+
+    Returns:
+        output (Tensor): [bs, Length_{query}, C]
+    """
+    bs, _, n_head, c = value.shape
+    _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape
+
+    split_shape = [h * w for h, w in value_spatial_shapes]
+    value_list = value.split(split_shape, axis=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for level, (h, w) in enumerate(value_spatial_shapes):
+        # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
+        value_l_ = value_list[level].flatten(2).transpose(
+            [0, 2, 1]).reshape([bs * n_head, c, h, w])
+        # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(
+            [0, 2, 1, 3, 4]).flatten(0, 1)
+        # N_*M_, D_, Lq_, P_
+        sampling_value_l_ = F.grid_sample(
+            value_l_,
+            sampling_grid_l_,
+            mode='bilinear',
+            padding_mode='zeros',
+            align_corners=False)
+        sampling_value_list.append(sampling_value_l_)
+    # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_)
+    attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape(
+        [bs * n_head, 1, Len_q, n_levels * n_points])
+    output = (paddle.stack(
+        sampling_value_list, axis=-2).flatten(-2) *
+              attention_weights).sum(-1).reshape([bs, n_head * c, Len_q])
+
+    return output.transpose([0, 2, 1])
+
+
+def get_valid_ratio(mask):
+    _, H, W = paddle.shape(mask)
+    valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H
+    valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W
+    # [b, 2]
+    return paddle.stack([valid_ratio_w, valid_ratio_h], -1)
+
+
+def get_denoising_training_group(targets,
+                                 num_classes,
+                                 num_queries,
+                                 class_embed,
+                                 num_denoising=100,
+                                 label_noise_ratio=0.5,
+                                 box_noise_scale=1.0):
+    """
+    Build noised ground-truth queries ("denoising groups") for training.
+
+    The ground truth of every image is padded to the largest GT count in the
+    batch, repeated `num_group` times, and perturbed with label and box
+    noise. An attention mask is built so that matching queries cannot see
+    the denoising queries and the groups cannot see each other.
+
+    Args:
+        targets (dict): reads "gt_class" and "gt_bbox", each indexed per
+            image. Boxes are presumably normalized (cx, cy, w, h), since the
+            last two values are used as the noise scale — TODO confirm.
+        num_classes (int): also used as the padding class id.
+        num_queries (int): number of matching (non-denoising) queries.
+        class_embed (Tensor): class embedding table, one row per class.
+        num_denoising (int): denoising query budget; determines `num_group`.
+        label_noise_ratio (float): labels are replaced with probability
+            `label_noise_ratio * 0.5`.
+        box_noise_scale (float): scale of the random box perturbation.
+
+    Returns:
+        tuple: (input_query_class, input_query_bbox, attn_mask, dn_meta), or
+        four `None`s when `num_denoising <= 0` or the batch has no GT.
+        `input_query_class` holds class embeddings [bs, num_denoising, dim];
+        `attn_mask` is boolean [tgt_size, tgt_size] with True meaning
+        "may attend".
+    """
+    if num_denoising <= 0:
+        return None, None, None, None
+    num_gts = [len(t) for t in targets["gt_class"]]
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    # at least one group, even when max_gt_num exceeds the budget
+    num_group = num_denoising // max_gt_num
+    num_group = 1 if num_group == 0 else num_group
+    # pad gt to max_num of a batch
+    bs = len(targets["gt_class"])
+    # padding slots carry class id `num_classes`
+    input_query_class = paddle.full(
+        [bs, max_gt_num], num_classes, dtype='int32')
+    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
+    # 1 for real GT slots, 0 for padding
+    pad_gt_mask = paddle.zeros([bs, max_gt_num])
+    for i in range(bs):
+        num_gt = num_gts[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
+            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
+            pad_gt_mask[i, :num_gt] = 1
+
+    # repeat the padded GT once per group along the query axis
+    input_query_class = input_query_class.tile([1, num_group])
+    input_query_bbox = input_query_bbox.tile([1, num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, num_group])
+
+    # query-axis indices of the real (non-padding) slots, split per image
+    dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1]
+    dn_positive_idx = paddle.split(dn_positive_idx,
+                                   [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * num_group)
+
+    if label_noise_ratio > 0:
+        input_query_class = input_query_class.flatten()
+        pad_gt_mask = pad_gt_mask.flatten()
+        # each slot is selected with probability label_noise_ratio * 0.5
+        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
+        # only real GT slots can be relabelled
+        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)
+        # randomly put a new one here
+        new_label = paddle.randint_like(
+            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class.scatter_(chosen_idx, new_label)
+        # in-place reshapes restore the [bs, num_denoising] layout
+        input_query_class.reshape_([bs, num_denoising])
+        pad_gt_mask.reshape_([bs, num_denoising])
+
+    if box_noise_scale > 0:
+        # per-coordinate noise range: half the last two values for the first
+        # two coordinates, the full last two values for the last two
+        diff = paddle.concat(
+            [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]],
+            axis=-1) * box_noise_scale
+        # uniform factor in [-1, 1)
+        diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0)
+        input_query_bbox += diff
+        # NOTE(review): the logit conversion only happens inside this branch,
+        # so with box_noise_scale <= 0 the boxes are returned un-converted —
+        # confirm this is intended.
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    # extra all-zero row so the padding class id `num_classes` maps to zeros
+    class_embed = paddle.concat(
+        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
+    input_query_class = paddle.gather(
+        class_embed, input_query_class.flatten(),
+        axis=0).reshape([bs, num_denoising, -1])
+
+    tgt_size = num_denoising + num_queries
+    # starts all False; True marks a blocked pair until the final inversion
+    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        if i == 0:
+            attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):
+                      num_denoising] = True
+        if i == num_group - 1:
+            attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num *
+                      i] = True
+        else:
+            # block the groups after and before group i
+            attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1):
+                      num_denoising] = True
+            attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num *
+                      i] = True
+    # invert: True now means the pair is allowed to attend
+    attn_mask = ~attn_mask
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, dn_meta
+
+
+def get_contrastive_denoising_training_group(targets,
+                                             num_classes,
+                                             num_queries,
+                                             class_embed,
+                                             num_denoising=100,
+                                             label_noise_ratio=0.5,
+                                             box_noise_scale=1.0):
+    if num_denoising <= 0:
+        return None, None, None, None
+    num_gts = [len(t) for t in targets["gt_class"]]
+    max_gt_num = max(num_gts)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_group = num_denoising // max_gt_num
+    num_group = 1 if num_group == 0 else num_group
+    # pad gt to max_num of a batch
+    bs = len(targets["gt_class"])
+    input_query_class = paddle.full(
+        [bs, max_gt_num], num_classes, dtype='int32')
+    input_query_bbox = paddle.zeros([bs, max_gt_num, 4])
+    pad_gt_mask = paddle.zeros([bs, max_gt_num])
+    for i in range(bs):
+        num_gt = num_gts[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1)
+            input_query_bbox[i, :num_gt] = targets["gt_bbox"][i]
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_group])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group])
+    # positive and negative mask
+    negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1])
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_group, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1]
+    dn_positive_idx = paddle.split(dn_positive_idx,
+                                   [n * num_group for n in num_gts])
+    # total denoising queries
+    num_denoising = int(max_gt_num * 2 * num_group)
+
+    if label_noise_ratio > 0:
+        input_query_class = input_query_class.flatten()
+        pad_gt_mask = pad_gt_mask.flatten()
+
+        # Convert pad_gt_mask to bool if it's not already
+        pad_gt_mask = pad_gt_mask.astype('bool')
+
+        # half of bbox prob
+        mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5)
+        chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1)
+        
+        # randomly put a new one here
+        new_label = paddle.randint_like(
+            chosen_idx, 0, num_classes, dtype=input_query_class.dtype)
+        
+        input_query_class.scatter_(chosen_idx, new_label)
+        input_query_class.reshape_([bs, num_denoising])
+        pad_gt_mask.reshape_([bs, num_denoising])
+
+    if box_noise_scale > 0:
+        known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox)
+
+        diff = paddle.tile(input_query_bbox[..., 2:] * 0.5,
+                           [1, 1, 2]) * box_noise_scale
+
+        rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+        rand_part = paddle.rand(input_query_bbox.shape)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (
+            1 - negative_gt_mask)
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    class_embed = paddle.concat(
+        [class_embed, paddle.zeros([1, class_embed.shape[-1]])])
+    input_query_class = paddle.gather(
+        class_embed, input_query_class.flatten(),
+        axis=0).reshape([bs, num_denoising, -1])
+
+    tgt_size = num_denoising + num_queries
+    attn_mask = paddle.ones([tgt_size, tgt_size]) < 0
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising:, :num_denoising] = True
+    # reconstruct cannot see each other
+    for i in range(num_group):
+        if i == 0:
+            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num *
+                      2 * (i + 1):num_denoising] = True
+        if i == num_group - 1:
+            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *
+                      i * 2] = True
+        else:
+            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num *
+                      2 * (i + 1):num_denoising] = True
+            attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num *
+                      2 * i] = True
+    attn_mask = ~attn_mask
+    dn_meta = {
+        "dn_positive_idx": dn_positive_idx,
+        "dn_num_group": num_group,
+        "dn_num_split": [num_denoising, num_queries]
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, dn_meta
+
+
+def get_sine_pos_embed(pos_tensor,
+                       num_pos_feats=128,
+                       temperature=10000,
+                       exchange_xy=True):
+    """generate sine position embedding from a position tensor
+
+    Args:
+        pos_tensor (Tensor): Shape as `(None, n)`.
+        num_pos_feats (int): projected shape for each float in the tensor. Default: 128
+        temperature (int): The temperature used for scaling
+            the position embedding. Default: 10000.
+        exchange_xy (bool, optional): exchange pos x and pos y. \
+            For example, input tensor is `[x, y]`, the results will  # noqa
+            be `[pos(y), pos(x)]`. Defaults: True.
+
+    Returns:
+        Tensor: Returned position embedding  # noqa
+        with shape `(None, n * num_pos_feats)`.
+    """
+    scale = 2. * math.pi
+    dim_t = 2. * paddle.floor_divide(
+        paddle.arange(num_pos_feats), paddle.to_tensor(2))
+    dim_t = scale / temperature**(dim_t / num_pos_feats)
+
+    def sine_func(x):
+        x *= dim_t
+        return paddle.stack(
+            (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2)
+
+    pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)]
+    if exchange_xy:
+        pos_res[0], pos_res[1] = pos_res[1], pos_res[0]
+    pos_res = paddle.concat(pos_res, axis=2)
+    return pos_res
+
+
+def mask_to_box_coordinate(mask,
+                           normalize=False,
+                           format="xyxy",
+                           dtype="float32"):
+    """
+    Compute the bounding boxes around the provided mask.
+    Args:
+        mask (Tensor:bool): [b, c, h, w]
+
+    Returns:
+        bbox (Tensor): [b, c, 4]
+    """
+    assert mask.ndim == 4
+    assert format in ["xyxy", "xywh"]
+    if mask.sum() == 0:
+        return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype)
+
+    h, w = mask.shape[-2:]
+    y, x = paddle.meshgrid(
+        paddle.arange(
+            end=h, dtype=dtype), paddle.arange(
+                end=w, dtype=dtype))
+
+    x_mask = x * mask
+    x_max = x_mask.flatten(-2).max(-1) + 1
+    x_min = paddle.where(mask, x_mask,
+                         paddle.to_tensor(1e8)).flatten(-2).min(-1)
+
+    y_mask = y * mask
+    y_max = y_mask.flatten(-2).max(-1) + 1
+    y_min = paddle.where(mask, y_mask,
+                         paddle.to_tensor(1e8)).flatten(-2).min(-1)
+    out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1)
+    if normalize:
+        out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype)
+
+    return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox)
+
+
+def varifocal_loss_with_logits(pred_logits,
+                               gt_score,
+                               label,
+                               normalizer=1.0,
+                               alpha=0.75,
+                               gamma=2.0):
+    pred_score = F.sigmoid(pred_logits)
+    weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label
+    loss = F.binary_cross_entropy_with_logits(
+        pred_logits, gt_score, weight=weight, reduction='none')
+    return loss.mean(1).sum() / normalizer
+
+
+
+
+from ..initializer import linear_init_
+
+class MLP(nn.Layer):
+    """This code is based on
+        https://github.com/facebookresearch/detr/blob/main/models/detr.py
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.LayerList(
+            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        for l in self.layers:
+            linear_init_(l)
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+